diff --git a/_clang-format b/.clang-format similarity index 100% rename from _clang-format rename to .clang-format diff --git a/.clang-tidy b/.clang-tidy index b5f9d549338..b242b140753 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -1,3 +1,7 @@ +HeaderFilterRegex: '/(examples|include|src|tests)/.*\.hpp' + +FormatStyle: file + Checks: > -*, readability-identifier-naming, diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 0cb8002152e..c081eb72997 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -1,5 +1,5 @@ #=============================================================================== -# Copyright 2019-2024 Intel Corporation +# Copyright 2019-2025 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,44 +15,48 @@ #=============================================================================== # Default -* @oneapi-src/onednn-arch @intel-innersource/dnn-arch +* @uxlfoundation/onednn-arch # Github automation -/.github/ @oneapi-src/onednn-devops +/.github/ @uxlfoundation/onednn-devops # CPU Engine -/src/cpu/aarch64/ @oneapi-src/onednn-cpu-aarch64 @intel-innersource/dnn-arch -/src/cpu/x64/ @oneapi-src/onednn-cpu-x64 @intel-innersource/dnn-cpu -/src/cpu/rnn/ @oneapi-src/onednn-cpu-x64 @intel-innersource/dnn-cpu +/src/cpu/aarch64/ @uxlfoundation/onednn-cpu-aarch64 +/src/cpu/x64/ @uxlfoundation/onednn-cpu-x64 +/src/cpu/rnn/ @uxlfoundation/onednn-cpu-x64 # GPU Engine -/src/gpu/amd/ @oneapi-src/onednn-gpu-amd @intel-innersource/dnn-arch -/src/gpu/intel/ @oneapi-src/onednn-gpu-intel @intel-innersource/dnn-gpu -/src/gpu/nvidia/ @oneapi-src/onednn-gpu-nvidia @intel-innersource/dnn-arch -/src/gpu/generic/ @oneapi-src/onednn-arch @intel-innersource/dnn-arch @intel-innersource/dnn-gpu -/src/gpu/generic/sycl/ @oneapi-src/onednn-gpu-generic @intel-innersource/dnn-arch @intel-innersource/dnn-gpu +/src/gpu/amd/ @uxlfoundation/onednn-gpu-amd +/src/gpu/intel/ @uxlfoundation/onednn-gpu-intel +/src/gpu/nvidia/ @uxlfoundation/onednn-gpu-nvidia +/src/gpu/generic/ @uxlfoundation/onednn-arch +/src/gpu/generic/sycl/ @uxlfoundation/onednn-gpu-generic # Tests -/tests/benchdnn/inputs/ @oneapi-src/onednn-maintain @intel-innersource/dnn-arch @intel-innersource/dnn-cpu @intel-innersource/dnn-gpu -/tests/benchdnn/graph/ @oneapi-src/onednn-graph @oneapi-src/onednn-arch @intel-innersource/dnn-graph @intel-innersource/dnn-arch -/tests/benchdnn/inputs/graph/ @oneapi-src/onednn-graph @oneapi-src/onednn-arch @intel-innersource/dnn-graph @intel-innersource/dnn-arch -/tests/gtests/graph/ @oneapi-src/onednn-graph @intel-innersource/dnn-graph +/tests/benchdnn/inputs/ @uxlfoundation/onednn-maintain +/tests/benchdnn/graph/ @uxlfoundation/onednn-graph @uxlfoundation/onednn-arch +/tests/benchdnn/inputs/graph/ @uxlfoundation/onednn-graph @uxlfoundation/onednn-arch +/tests/gtests/graph/ @uxlfoundation/onednn-graph # Graph API -/src/graph/ @oneapi-src/onednn-graph @intel-innersource/dnn-graph - -# Graph compiler -/src/graph/backend/graph_compiler/ @intel-innersource/dnn-compiler -/tests/gtests/graph/unit/backend/graph_compiler/ @intel-innersource/dnn-compiler +/src/graph/ @uxlfoundation/onednn-graph # Documentation -*.md @oneapi-src/onednn-doc @oneapi-src/onednn-arch @intel-innersource/dnn-doc @intel-innersource/dnn-arch -/doc/ @oneapi-src/onednn-doc @oneapi-src/onednn-arch @intel-innersource/dnn-doc @intel-innersource/dnn-arch +*.md @uxlfoundation/onednn-doc @uxlfoundation/onednn-arch +/doc/ @uxlfoundation/onednn-doc 
@uxlfoundation/onednn-arch
+
+# Third party components
+/third_party/ @uxlfoundation/onednn-arch
+/third_party/level_zero/ @uxlfoundation/onednn-gpu-intel
+/third_party/mdapi/ @uxlfoundation/onednn-gpu-intel
+/third_party/ngen/ @uxlfoundation/onednn-gpu-intel
+/third_party/xbyak/ @uxlfoundation/onednn-cpu-x64
+/third_party/xbyak_aarch64/ @uxlfoundation/onednn-cpu-aarch64
 
 # Governance and process
-/.github/CODEOWNERS @oneapi-src/onednn-maintain
-/SECURITY.md @oneapi-src/onednn-maintain
-/MAINTAINERS.md @oneapi-src/onednn-maintain
-/CONTRIBUTING.md @oneapi-src/onednn-maintain
-/CODING_STANDARDS.md @oneapi-src/onednn-maintain
-/CODE_OF_CONDUCT.md @oneapi-src/onednn-maintain
+/.github/CODEOWNERS @uxlfoundation/onednn-maintain
+/SECURITY.md @uxlfoundation/onednn-maintain
+/MAINTAINERS.md @uxlfoundation/onednn-maintain
+/CONTRIBUTING.md @uxlfoundation/onednn-maintain
+/CODING_STANDARDS.md @uxlfoundation/onednn-maintain
+/CODE_OF_CONDUCT.md @uxlfoundation/onednn-maintain
diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md
index 2770b4545dc..6141ad2ca74 100644
--- a/.github/ISSUE_TEMPLATE/bug_report.md
+++ b/.github/ISSUE_TEMPLATE/bug_report.md
@@ -12,7 +12,7 @@ factors are considered important to reproduce an issue.
 
 # Version
 Report oneDNN version and githash. Version information is printed to stdout
-in [verbose mode](https://oneapi-src.github.io/oneDNN/dev_guide_verbose.html).
+in [verbose mode](https://uxlfoundation.github.io/oneDNN/dev_guide_verbose.html).
 
 # Environment
 oneDNN includes hardware-specific optimizations and may behave
@@ -28,10 +28,10 @@ the following information to help reproduce the issue:
 
 # Steps to reproduce
 Please check that the issue is reproducible with the latest revision on
-master. Include all the steps to reproduce the issue.
+main. Include all the steps to reproduce the issue.
 
-You can use [verbose mode](https://oneapi-src.github.io/oneDNN/dev_guide_verbose.html)
-and [benchdnn](https://github.com/oneapi-src/oneDNN/tree/master/tests/benchdnn)
+You can use [verbose mode](https://uxlfoundation.github.io/oneDNN/dev_guide_verbose.html)
+and [benchdnn](https://github.com/uxlfoundation/oneDNN/tree/main/tests/benchdnn)
 to validate correctness of all primitives the library supports. If this does
 not work a short C/C++ program or modified unit tests demonstrating the issue
 will greatly help with the investigation.
@@ -40,7 +40,7 @@ will greatly help with the investigation.
 Document behavior you observe. For performance defects, like performance
 regressions or a function being slow, provide a log including output generated
 by your application in
-[verbose mode](https://oneapi-src.github.io/oneDNN/dev_guide_verbose.html).
+[verbose mode](https://uxlfoundation.github.io/oneDNN/dev_guide_verbose.html).
 
 # Expected behavior
 Document behavior you expect.
\ No newline at end of file
diff --git a/.github/automation/.azure-pipeline.yml b/.github/automation/.azure-pipeline.yml
deleted file mode 100644
index a6ddac46fe9..00000000000
--- a/.github/automation/.azure-pipeline.yml
+++ /dev/null
@@ -1,132 +0,0 @@
-#! /bin/bash
-
-#===============================================================================
-# Copyright 2019-2024 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -#=============================================================================== - -trigger: -- main -- rls-* - -jobs: - - job: 'ClangFormat' - pool: - vmImage: 'ubuntu-20.04' - steps: - - script: | - .github/automation/env/clang.sh 11 - displayName: 'init' - - script: | - .github/automation/clang-format.sh - displayName: 'ClangFormat_Check' - failOnStderr: true - - job: 'Ubuntu20' - timeoutInMinutes: 120 - pool: - vmImage: 'ubuntu-20.04' - strategy: - matrix: - clang: - CC: clang - CXX: clang++ - gcc: - CC: gcc - CXX: g++ - steps: - - script: | - if [ "$(CC)" == "clang" ]; then - .github/automation/env/clang.sh 9 - fi - displayName: "Init_Env" - - script: | - .github/automation/build.sh --threading omp --mode Release --source-dir $(pwd) --build-dir $(pwd)/build - displayName: 'build' - - script: | - .github/automation/test.sh --build-dir $(pwd)/build --report-dir $(pwd)/report - displayName: 'test' - failOnStderr: true - - job: 'Ubuntu22' - timeoutInMinutes: 120 - pool: - vmImage: 'ubuntu-22.04' - strategy: - matrix: - clang: - CC: clang - CXX: clang++ - gcc: - CC: gcc - CXX: g++ - steps: - - script: | - if [ "$(CC)" == "clang" ]; then - .github/automation/env/clang.sh 15 - fi - displayName: "Init_Env" - - script: | - .github/automation/build.sh --threading omp --mode Release --source-dir $(pwd) --build-dir $(pwd)/build - displayName: 'build' - - script: | - .github/automation/test.sh --build-dir $(pwd)/build --report-dir $(pwd)/report - displayName: 'test' - failOnStderr: true - - job: 'macOS12' - timeoutInMinutes: 120 - pool: - vmImage: 'macOS-12' - steps: - - script: | - .github/automation/build.sh --threading omp --mode Release --source-dir $(pwd) --build-dir $(pwd)/build - displayName: 'build' - - script: | - .github/automation/test.sh --build-dir $(pwd)/build --report-dir $(pwd)/report - displayName: 'test' - failOnStderr: true - - job: 'macOS13' - timeoutInMinutes: 120 - pool: - vmImage: 'macOS-13' - steps: - - script: | - .github/automation/build.sh --threading omp --mode Release --source-dir $(pwd) --build-dir $(pwd)/build - displayName: 'build' - - script: | - .github/automation/test.sh --build-dir $(pwd)/build --report-dir $(pwd)/report - displayName: 'test' - failOnStderr: true - - job: 'Windows_Server_2022' - timeoutInMinutes: 120 - pool: - vmImage: 'windows-2022' - steps: - - script: | - .github\automation\build.bat /THREADING omp /MODE Release /VSVERSION vs2022 /SOURCEDIR %CD% /BUILDDIR %CD%\build - displayName: 'build' - - script: | - .github\automation\test.bat /BUILDDIR %CD%\build /MODE Release /REPORTDIR %CD%\report - displayName: 'test' - failOnStderr: true - - job: 'Windows_Server_2019' - timeoutInMinutes: 120 - pool: - vmImage: 'windows-2019' - steps: - - script: | - .github\automation\build.bat /THREADING omp /MODE Release /VSVERSION vs2019 /SOURCEDIR %CD% /BUILDDIR %CD%\build - displayName: 'build' - - script: | - .github\automation\test.bat /BUILDDIR %CD%\build /MODE Release /REPORTDIR %CD%\report - displayName: 'test' - failOnStderr: true diff --git a/.github/automation/aarch64/build.sh b/.github/automation/aarch64/build.sh new file mode 100755 index 
00000000000..a3d8d81f26b --- /dev/null +++ b/.github/automation/aarch64/build.sh @@ -0,0 +1,54 @@ +#! /bin/bash + +# ******************************************************************************* +# Copyright 2024 Arm Limited and affiliates. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ******************************************************************************* + +# Build oneDNN for aarch64. + +set -o errexit -o pipefail -o noclobber + +SCRIPT_DIR="$(dirname "$(readlink -f "$0")")" + +# Defines MP, CC, CXX and OS. +source ${SCRIPT_DIR}/common.sh + +export ACL_ROOT_DIR=${ACL_ROOT_DIR:-"${PWD}/ComputeLibrary"} + +CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:-"Release"} +ONEDNN_TEST_SET=${ONEDNN_TEST_SET:-"SMOKE"} +ONEDNN_BUILD_GRAPH=${ONEDNN_BUILD_GRAPH:-"ON"} + +if [[ "$ONEDNN_ACTION" == "configure" ]]; then + set -x + cmake \ + -Bbuild -S. \ + -DDNNL_USE_ACL=ON \ + -DONEDNN_BUILD_GRAPH=$ONEDNN_BUILD_GRAPH \ + -DDNNL_CPU_RUNTIME=$ONEDNN_THREADING \ + -DONEDNN_WERROR=ON \ + -DDNNL_BUILD_FOR_CI=ON \ + -DONEDNN_TEST_SET=$ONEDNN_TEST_SET \ + -DCMAKE_BUILD_TYPE=$CMAKE_BUILD_TYPE + set +x +elif [[ "$ONEDNN_ACTION" == "build" ]]; then + set -x + cmake --build build + set +x +else + echo "Unknown action: $ONEDNN_ACTION" + exit 1 +fi diff --git a/.github/automation/aarch64/build_acl.sh b/.github/automation/aarch64/build_acl.sh new file mode 100755 index 00000000000..53cc2a825fc --- /dev/null +++ b/.github/automation/aarch64/build_acl.sh @@ -0,0 +1,81 @@ +#! /bin/bash + +# ******************************************************************************* +# Copyright 2020-2025 Arm Limited and affiliates. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ******************************************************************************* + +# Build ACL from github. + +set -o errexit -o pipefail -o noclobber + +SCRIPT_DIR="$(dirname "$(readlink -f "$0")")" + +# Defines MP, CC, CXX and OS. 
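+# As an illustrative sketch only (the authoritative definitions live in
+# common.sh, added later in this patch), the sourced file provides roughly:
+#   export OS=$(uname)            # "Linux" or "Darwin"
+#   export MP="-j$(nproc)"        # parallelism flag forwarded to scons/make
+#   export CC=gcc-${GCC_VERSION}  # when BUILD_TOOLSET=gcc
+#   export CXX=g++-${GCC_VERSION}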
+source ${SCRIPT_DIR}/common.sh
+
+ACL_BUILD_TYPE=${ACL_BUILD_TYPE:-"Release"}
+ACL_ROOT_DIR=${ACL_ROOT_DIR:-"${PWD}/ComputeLibrary"}
+ACL_REPO="https://github.com/ARM-software/ComputeLibrary.git"
+
+if [[ "$OS" == "Linux" ]]; then
+    ACL_MULTI_ISA_SUPPORT=1
+    if [[ "$ACL_THREADING" == "OMP" ]]; then
+        ACL_OPENMP=1
+    elif [[ "$ACL_THREADING" == "SEQ" ]]; then
+        ACL_OPENMP=0
+    fi
+    ACL_OS="linux"
+elif [[ "$OS" == "Darwin" ]]; then
+    ACL_MULTI_ISA_SUPPORT=0
+    ACL_OPENMP=0
+    ACL_OS="macos"
+else
+    echo "Unknown OS: $OS"
+    exit 1
+fi
+
+if [[ "$ACL_BUILD_TYPE" == "Release" ]]; then
+    ACL_DEBUG=0
+elif [[ "$ACL_BUILD_TYPE" == "Debug" ]]; then
+    ACL_DEBUG=1
+else
+    echo "Unknown build config: $ACL_BUILD_TYPE"
+    exit 1
+fi
+
+if [[ "$ACL_ACTION" == "clone" ]]; then
+    set -x
+    git clone --branch $ACL_VERSION --depth 1 $ACL_REPO $ACL_ROOT_DIR
+    set +x
+elif [[ "$ACL_ACTION" == "build" ]]; then
+    set -x
+    cd $ACL_ROOT_DIR
+    scons $MP Werror=0 debug=$ACL_DEBUG neon=1 opencl=0 embed_kernels=0 \
+        os=$ACL_OS arch=armv8.2-a build=native multi_isa=$ACL_MULTI_ISA_SUPPORT \
+        fixed_format_kernels=1 cppthreads=0 openmp=$ACL_OPENMP examples=0 \
+        validation_tests=0
+    set +x
+else
+    echo "Unknown action: $ACL_ACTION"
+    exit 1
+fi
diff --git a/.github/automation/aarch64/ci.json b/.github/automation/aarch64/ci.json
new file mode 100644
index 00000000000..bdd44eaed7c
--- /dev/null
+++ b/.github/automation/aarch64/ci.json
@@ -0,0 +1,8 @@
+{
+  "dependencies": {
+    "acl": "v25.02",
+    "gcc": "13",
+    "clang": "17",
+    "onednn-base": "v3.7"
+  }
+}
diff --git a/.github/automation/aarch64/common.sh b/.github/automation/aarch64/common.sh
new file mode 100644
index 00000000000..cfb483eb468
--- /dev/null
+++ b/.github/automation/aarch64/common.sh
@@ -0,0 +1,46 @@
+#! /bin/bash
+
+# *******************************************************************************
+# Copyright 2024-2025 Arm Limited and affiliates.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# *******************************************************************************
+
+# Common variables for aarch64 ci. Exports:
+# CC, CXX, OS, MP
+
+set -o errexit -o pipefail -o noclobber
+
+export OS=$(uname)
+
+# Num threads on system.
+if [[ "$OS" == "Darwin" ]]; then
+    export MP="-j$(sysctl -n hw.ncpu)"
+elif [[ "$OS" == "Linux" ]]; then
+    export MP="-j$(nproc)"
+fi
+
+if [[ "$BUILD_TOOLSET" == "gcc" ]]; then
+    export CC=gcc-${GCC_VERSION}
+    export CXX=g++-${GCC_VERSION}
+elif [[ "$BUILD_TOOLSET" == "clang" ]]; then
+    export CC=clang
+    export CXX=clang++
+fi
+
+# Print every exported variable.
+echo "OS: $OS"
+echo "MP: $MP"
+echo "Toolset: $BUILD_TOOLSET"
+echo "CC: $CC"
+echo "CXX: $CXX"
diff --git a/.github/automation/aarch64/get_acl.sh b/.github/automation/aarch64/get_acl.sh
new file mode 100755
index 00000000000..7745b9b9764
--- /dev/null
+++ b/.github/automation/aarch64/get_acl.sh
@@ -0,0 +1,95 @@
+#! /bin/bash
+
+# *******************************************************************************
+# Copyright 2024-2025 Arm Limited and affiliates.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# *******************************************************************************
+
+set -o errexit -o pipefail -o noclobber
+
+WORKSPACE=${GITHUB_WORKSPACE:-$(pwd)}
+echo "github workspace $GITHUB_WORKSPACE"
+
+os_type=$(uname)
+
+ACL_WITH_ASSERTS=${ACL_WITH_ASSERTS:-0}
+ACL_VERSION=${ACL_VERSION:-v24.08.1}
+
+if [[ "$os_type" == "Linux" ]]; then
+    echo "This machine is running Linux"
+    ARCHIVE="arm_compute-${ACL_VERSION}-linux-aarch64-cpu-bin.tar.gz"
+elif [[ "$os_type" == "Darwin" ]]; then
+    echo "This machine is running macOS"
+    ARCHIVE="arm_compute-${ACL_VERSION}-macos-aarch64-cpu-bin.tar.gz"
+else
+    echo "Unknown OS: $os_type"
+    exit 1
+fi
+
+# Set version and root directory
+export ACL_ROOT_DIR="${WORKSPACE}/ComputeLibrary"
+
+echo "ACL_VERSION: ${ACL_VERSION}"
+echo "ACL_ROOT_DIR: ${ACL_ROOT_DIR}"
+echo "ACL_WITH_ASSERTS: ${ACL_WITH_ASSERTS}"
+
+# Download the specified Compute Library version
+if [[ ! -f $ARCHIVE ]]; then
+    ACL_URL="https://github.com/ARM-software/ComputeLibrary/releases/download/${ACL_VERSION}/${ARCHIVE}"
+    echo "Downloading ACL from ${ACL_URL}"
+    wget ${ACL_URL}
+else
+    echo "$ARCHIVE already exists, skipping download."
+fi
+
+# Function to find the appropriate lib directory
+find_acl_lib_dir() {
+    local dirs=("$ACL_ROOT_DIR"/lib/*/)
+    local selected_dir=""
+
+    # Select directory based on build type
+    for dir in "${dirs[@]}"; do
+        if [[ $ACL_WITH_ASSERTS == 1 ]]; then
+            [[ "$dir" == *"-asserts/" ]] && selected_dir="$dir" && break
+        else
+            [[ "$dir" != *"-asserts/" ]] && selected_dir="$dir" && break
+        fi
+    done
+
+    # Return result or exit if not found
+    if [[ -z "$selected_dir" ]]; then
+        echo "No matching ACL lib directory found." >&2
+        exit 1
+    else
+        echo "$selected_dir"
+    fi
+}
+
+# Extract the tarball if not already extracted
+if [[ ! -d $ACL_ROOT_DIR ]]; then
+    mkdir -p $ACL_ROOT_DIR
+    tar -xzvf "${ARCHIVE}" -C $ACL_ROOT_DIR --strip-components=1 >/dev/null 2>&1
+else
+    echo "$ACL_ROOT_DIR directory already exists, skipping extraction."
+fi
+
+# Find the ACL library directory
+ACL_LIB_DIR=$(find_acl_lib_dir)
+echo "Using ACL lib from ${ACL_LIB_DIR}"
+echo "cp contents from ${ACL_LIB_DIR} to ${ACL_ROOT_DIR}/lib"
+cp -rf "$ACL_LIB_DIR"* "$ACL_ROOT_DIR/lib/"
+
+echo "${ACL_VERSION}" >"${ACL_ROOT_DIR}/arm_compute/arm_compute_version.embed"
diff --git a/.github/automation/aarch64/skipped-tests.sh b/.github/automation/aarch64/skipped-tests.sh
new file mode 100755
index 00000000000..01f01923e76
--- /dev/null
+++ b/.github/automation/aarch64/skipped-tests.sh
@@ -0,0 +1,81 @@
+#!/usr/bin/env bash
+
+# *******************************************************************************
+# Copyright 2025 Arm Limited and affiliates.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# *******************************************************************************
+
+# Emit the regex of tests to skip when testing oneDNN on AArch64.
+
+set -eo pipefail
+
+OS=${OS:-"Linux"}
+
+# The graph API is not officially supported on AArch64 for now.
+SKIPPED_GRAPH_TEST_FAILURES="test_graph_unit_dnnl_sdp_decomp_cpu"
+SKIPPED_GRAPH_TEST_FAILURES+="|test_graph_unit_dnnl_mqa_decomp_cpu"
+
+# Described in issue: https://github.com/uxlfoundation/oneDNN/issues/2175
+SKIPPED_TEST_FAILURES="test_benchdnn_modeC_matmul_multidims_cpu"
+
+# We currently have some OS- and config-specific test failures.
+if [[ "$OS" == "Linux" ]]; then
+    if [[ "$CMAKE_BUILD_TYPE" == "Debug" ]]; then
+        # test_matmul is time-consuming, so we only run it in Release mode to save time.
+        SKIPPED_TEST_FAILURES+="|test_matmul"
+    fi
+    SKIPPED_TEST_FAILURES+="|test_benchdnn_modeC_binary_ci_cpu"
+    SKIPPED_TEST_FAILURES+="|test_benchdnn_modeC_binary_different_dt_ci_cpu"
+
+    SKIPPED_GRAPH_TEST_FAILURES+="|test_benchdnn_modeC_graph_ci_cpu"
+    SKIPPED_GRAPH_TEST_FAILURES+="|cpu-graph-gqa-cpp"
+    SKIPPED_GRAPH_TEST_FAILURES+="|cpu-graph-mqa-cpp"
+    SKIPPED_GRAPH_TEST_FAILURES+="|cpu-graph-sdpa-cpp"
+    SKIPPED_GRAPH_TEST_FAILURES+="|cpu-graph-sdpa-stacked-qkv-cpp"
+    SKIPPED_GRAPH_TEST_FAILURES+="|test_graph_unit_dnnl_large_partition_cpu"
+
+    # Tests failing in the OpenVINO Toolkit oneDNN fork
+    SKIPPED_TEST_FAILURES+="|test_batch_normalization"
+    SKIPPED_TEST_FAILURES+="|test_eltwise"
+    SKIPPED_TEST_FAILURES+="|test_iface_attr"
+    SKIPPED_TEST_FAILURES+="|test_lrn"
+    SKIPPED_TEST_FAILURES+="|test_pooling_forward"
+    SKIPPED_TEST_FAILURES+="|test_reduction"
+    SKIPPED_TEST_FAILURES+="|test_api"
+    SKIPPED_TEST_FAILURES+="|test_benchdnn_modeC_binary_smoke_cpu"
+    SKIPPED_TEST_FAILURES+="|test_benchdnn_modeC_bnorm_smoke_cpu"
+    SKIPPED_TEST_FAILURES+="|test_benchdnn_modeC_conv_smoke_cpu"
+    SKIPPED_TEST_FAILURES+="|test_benchdnn_modeC_deconv_smoke_cpu"
+    SKIPPED_TEST_FAILURES+="|test_benchdnn_modeC_eltwise_smoke_cpu"
+    SKIPPED_TEST_FAILURES+="|test_benchdnn_modeC_lrn_smoke_cpu"
+    SKIPPED_TEST_FAILURES+="|test_benchdnn_modeC_pool_smoke_cpu"
+    SKIPPED_TEST_FAILURES+="|test_benchdnn_modeC_reduction_smoke_cpu"
+fi
+
+# Nightly failures
+SKIPPED_NIGHTLY_TEST_FAILURES="test_benchdnn_modeC_bnorm_all_blocked_cpu"
+SKIPPED_NIGHTLY_TEST_FAILURES+="|test_benchdnn_modeC_bnorm_regressions_cpu"
+SKIPPED_NIGHTLY_TEST_FAILURES+="|test_benchdnn_modeC_conv_int8_cpu"
+SKIPPED_NIGHTLY_TEST_FAILURES+="|test_benchdnn_modeC_graph_fusions_cpu"
+SKIPPED_NIGHTLY_TEST_FAILURES+="|test_benchdnn_modeC_matmul_sparse_gpu_cpu"
+SKIPPED_NIGHTLY_TEST_FAILURES+="|test_benchdnn_modeC_reorder_all_cpu"
+
+# * c7g failures. TODO: scope these to c7g only. Better yet, fix them.
+SKIPPED_NIGHTLY_TEST_FAILURES+="|test_benchdnn_modeC_binary_all_cpu"
+SKIPPED_NIGHTLY_TEST_FAILURES+="|test_benchdnn_modeC_graph_int8_cpu"
+
+SKIPPED_TEST_FAILURES+="|${SKIPPED_GRAPH_TEST_FAILURES}|${SKIPPED_NIGHTLY_TEST_FAILURES}"
+
+printf "%s" "${SKIPPED_TEST_FAILURES}"
diff --git a/.github/automation/aarch64/test.sh b/.github/automation/aarch64/test.sh
new file mode 100755
index 00000000000..d26fccb9aea
--- /dev/null
+++ b/.github/automation/aarch64/test.sh
@@ -0,0 +1,38 @@
+#!/usr/bin/env bash
+
+# *******************************************************************************
+# Copyright 2024-2025 Arm Limited and affiliates.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# *******************************************************************************
+
+# Test oneDNN for aarch64.
+
+set -o errexit -o pipefail -o noclobber
+
+SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
+
+export CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:-"Release"}
+
+# Defines MP, CC, CXX and OS.
+source ${SCRIPT_DIR}/common.sh
+
+# Sequential (probably macOS) builds should use num proc parallelism.
+if [[ "$ONEDNN_THREADING" == "SEQ" ]]; then
+    export CTEST_PARALLEL_LEVEL=""
+fi
+
+set -x
+ctest --no-tests=error --output-on-failure -E $("${SCRIPT_DIR}"/skipped-tests.sh)
+set +x
diff --git a/.github/automation/build.sh b/.github/automation/build.sh
deleted file mode 100755
index 5a684f6353f..00000000000
--- a/.github/automation/build.sh
+++ /dev/null
@@ -1,108 +0,0 @@
-#! /bin/bash
-
-#===============================================================================
-# Copyright 2019-2023 Intel Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#===============================================================================
-
-
-while [[ $# -gt 0 ]]; do
-    key="$1"
-
-    case $key in
-        --threading)
-        BUILD_THREADING="$2"
-        ;;
-        --mode)
-        BUILD_MODE="$2"
-        ;;
-        --source-dir)
-        SORUCE_DIR="$2"
-        ;;
-        --acl-dir)
-        ACL_DIR="$2"
-        ;;
-        --build-dir)
-        BUILD_DIR="$2"
-        ;;
-        --cmake-opt)
-        CMAKE_OPT="$2"
-        ;;
-        *)
-        echo "Unknown option: $1"
-        exit 1
-        ;;
-    esac
-    shift
-    shift
-done
-
-CMAKE_OPTIONS="-DCMAKE_BUILD_TYPE=${BUILD_MODE} -DDNNL_BUILD_FOR_CI=ON -DDNNL_WERROR=ON ${CMAKE_OPT}"
-
-CPU_RUNTIME="NONE"
-GPU_RUNTIME="NONE"
-
-if [ "${BUILD_THREADING}" == "tbb" ]; then
-    CPU_RUNTIME="TBB"
-    echo "Info: Setting DNNL_CPU_RUNTIME to TBB..."
-elif [ "${BUILD_THREADING}" == "omp" ]; then
-    echo "Info: Setting DNNL_CPU_RUNTIME to OMP..."
- CPU_RUNTIME="OMP" -elif [ "${BUILD_THREADING}" == "ocl" ]; then - echo "Info: Setting DNNL_CPU_RUNTIME to OMP..." - echo "Info: Setting DNNL_GPU_RUNTIME to OCL..." - CPU_RUNTIME="OMP" - GPU_RUNTIME="OCL" -else - echo "Error unknown threading: ${BUILD_THREADING}" - exit 1 -fi - -CMAKE_OPTIONS="${CMAKE_OPTIONS} - -DDNNL_CPU_RUNTIME=${CPU_RUNTIME} - -DDNNL_GPU_RUNTIME=${GPU_RUNTIME} - -DDNNL_TEST_SET=SMOKE - " - -# Enable Compute Library backend if a location for the built library is given -# NOTE: only for AArch64 builds. -if [ ! -z ${ACL_DIR} ]; then - export ACL_ROOT_DIR=$ACL_DIR - CMAKE_OPTIONS="${CMAKE_OPTIONS} -DDNNL_AARCH64_USE_ACL=ON" - echo "Info: Building with Arm Compute Library backend for Aarch64..." -fi - -if [ "$(uname)" == "Linux" ]; then - MAKE_OP="-j$(grep -c processor /proc/cpuinfo)" -else - MAKE_OP="-j$(sysctl -n hw.physicalcpu)" -fi - -cd "${SORUCE_DIR}" -echo "Calling CMake with otions: ${CMAKE_OPTIONS}" -cmake . -B${BUILD_DIR} ${CMAKE_OPTIONS} -err=$? -if [ "$err" != 0 ]; then - if [ -e "${BUILD_DIR}/CMakeFiles/CMakeOutput.log" ]; then - echo "CMakeOutput.log:" - cat ${BUILD_DIR}/CMakeFiles/CMakeOutput.log - fi - if [ -e "${BUILD_DIR}/CMakeFiles/CMakeError.log" ]; then - echo "CMakeError.log:" - cat ${BUILD_DIR}/CMakeFiles/CMakeError.log - fi - exit $err -fi -cd ${BUILD_DIR} && make -k ${MAKE_OP} -exit $? diff --git a/.github/automation/build_acl.sh b/.github/automation/build_acl.sh deleted file mode 100755 index 41c6b0b4a2e..00000000000 --- a/.github/automation/build_acl.sh +++ /dev/null @@ -1,61 +0,0 @@ -#! /bin/bash - -# ******************************************************************************* -# Copyright 2020-2023 Arm Limited and affiliates. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ******************************************************************************* - -# Compute Library build defaults -ACL_VERSION="v23.11" -ACL_DIR="${PWD}/ComputeLibrary" -ACL_ARCH="armv8a" -ACL_MULTI_ISA_SUPPORT=0 - -while [[ $# -gt 0 ]]; do - case $1 in - --version) - ACL_VERSION="v$2" - shift - ;; - --arch) - ACL_ARCH="$2" - shift - ;; - --multi_isa) - ACL_MULTI_ISA_SUPPORT=1 - ;; - --root-dir) - ACL_DIR="$2" - shift - ;; - *) - echo "Unknown option: $1" - exit 1 - ;; - esac - shift -done - -readonly ACL_REPO="https://github.com/ARM-software/ComputeLibrary.git" -MAKE_NP="-j$(grep -c processor /proc/cpuinfo)" - -git clone --branch $ACL_VERSION --depth 1 $ACL_REPO $ACL_DIR -cd $ACL_DIR - -scons --silent $MAKE_NP Werror=0 debug=0 neon=1 opencl=0 embed_kernels=0 \ - os=linux arch=$ACL_ARCH build=native multi_isa=$ACL_MULTI_ISA_SUPPORT \ - fixed_format_kernels=1 - -exit $? diff --git a/.github/automation/clang-format.sh b/.github/automation/clang-format.sh index b38c3877466..28c99331109 100755 --- a/.github/automation/clang-format.sh +++ b/.github/automation/clang-format.sh @@ -16,20 +16,31 @@ # limitations under the License. 
 #===============================================================================
 
-echo "Using clang-format version: $(clang-format --version)"
+CLANG_FORMAT=clang-format-11
+
+echo "Checking ${CLANG_FORMAT}"
+if ! ${CLANG_FORMAT} --version; then
+    echo "${CLANG_FORMAT} is not available or not working correctly."
+    exit 1
+fi
+
 echo "Starting format check..."
-for filename in $(find "$(pwd)" -type f | grep -P ".*\.(c|cpp|h|hpp|cl)$"); do clang-format -style=file -i $filename; done
+for filename in $(find "$(pwd)" -type f | grep -P ".*\.(c|cpp|h|hpp|cl)$"); do ${CLANG_FORMAT} -style=file -i $filename; done
 
 RETURN_CODE=0
-echo $(git status) | grep "nothing to commit" > /dev/null
+echo $(git status) | grep "nothing to commit" > /dev/null
 if [ $? -eq 1 ]; then
-    echo "Clang-format check FAILED! Found not formatted files!"
-    echo "$(git status)"
+    echo "Clang-format check FAILED! The following files must be formatted with ${CLANG_FORMAT}:"
+    echo "$(git diff --name-only)"
+    echo
+    echo "Changes required to pass this check:"
+    echo "$(git diff)"
+    echo
     RETURN_CODE=3
 else
-    echo "Clang-format check PASSED! Not formatted files not found..."
+    echo "Clang-format check PASSED!"
 fi
-exit ${RETURN_CODE}
\ No newline at end of file
+exit ${RETURN_CODE}
diff --git a/.github/automation/commit-msg-check.py b/.github/automation/commit-msg-check.py
new file mode 100755
index 00000000000..aa6ca2cd5f6
--- /dev/null
+++ b/.github/automation/commit-msg-check.py
@@ -0,0 +1,88 @@
+#!/usr/bin/python3
+
+# *******************************************************************************
+# Copyright 2024 Arm Limited and affiliates.
+# Copyright 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# *******************************************************************************
+
+import argparse
+import subprocess
+import re
+import sys
+
+# Ensure the scope ends in a colon and that same-level scopes are
+# comma-delimited.
+# Current implementation only checks the first level scope as ':' can be used
+# in the commit description (ex: TBB::tbb or bf16:bf16).
+# TODO: Limit scopes to an acceptable list of tags.
+def __scopeCheck(msg: str):
+    status = "Message scope:"
+
+    if not re.match('^[a-z0-9_]+(, [a-z0-9_]+)*: ', msg):
+        print(f"{status} FAILED: Commit message must follow the format "
+              "<scope>:[ <scope>:] <description>")
+        return False
+
+    print(f"{status} OK")
+    return True
+
+# Ensure a character limit for the first line.
+def __numCharacterCheck(msg: str):
+    status = "Message length:"
+    if len(msg) <= 72:
+        print(f"{status} OK")
+        return True
+    else:
+        # Fixup commits usually include the full name of the commit they are
+        # fixing, which adds 6 more symbols to the message. Let them in.
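+        # For example (hypothetical summary, for illustration only):
+        # "fixup: cpu: aarch64: fix convolution segfault" repeats the full
+        # summary of the commit being fixed, so it may exceed the limit.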
+        if re.match('^fixup: ', msg):
+            print(f"{status} Fixup message, OK")
+            return True
+        else:
+            print(f"{status} FAILED: Commit message summary must not "
+                  "exceed 72 characters.")
+            return False
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("head", help="Head commit of PR branch")
+    parser.add_argument("base", help="Base commit of PR branch")
+    args = parser.parse_args()
+    base: str = args.base
+    head: str = args.head
+
+    commit_range = base + ".." + head
+    messages = subprocess.run(["git", "rev-list", "--format=oneline",
+                               commit_range], capture_output=True, text=True).stdout
+
+    is_ok = True
+    for i in messages.splitlines():
+        print(i)
+        commit_msg = i.split(' ', 1)[1]
+        result = __numCharacterCheck(commit_msg)
+        is_ok = is_ok and result
+        result = __scopeCheck(commit_msg)
+        is_ok = is_ok and result
+
+    if is_ok:
+        print("All commit messages are formatted correctly.")
+    else:
+        print("Some commit message checks failed. Please align commit messages "
+              "with the Contributing Guidelines and update the PR.")
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/.github/automation/env/qemu.sh b/.github/automation/env/qemu.sh
deleted file mode 100755
index 71dc2c636f6..00000000000
--- a/.github/automation/env/qemu.sh
+++ /dev/null
@@ -1,25 +0,0 @@
-#! /bin/bash
-
-#===============================================================================
-# Copyright 2020 FUJITSU LIMITED
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#===============================================================================
-
-# Download, build and install QEMU
-wget https://download.qemu.org/qemu-5.0.0.tar.xz
-tar xJf qemu-5.0.0.tar.xz > /dev/null
-cd qemu-5.0.0
-./configure --target-list=aarch64-linux-user > /dev/null
-make > /dev/null
-make install > /dev/null
diff --git a/.github/automation/performance/bench_nightly_performance.sh b/.github/automation/performance/bench_nightly_performance.sh
new file mode 100644
index 00000000000..9b66f0a7c01
--- /dev/null
+++ b/.github/automation/performance/bench_nightly_performance.sh
@@ -0,0 +1,53 @@
+#! /bin/bash
+
+# *******************************************************************************
+# Copyright 2025 Arm Limited and affiliates.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ******************************************************************************* + +# Usage: bash bench_nightly_performance.sh {baseline_benchdnn_executable} {benchdnn_executable} {baseline_results_file} {new_results_file} + +IFS=$'\n' # Prevents shuffling from using spaces as delimiters + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) + +TESTS=( + "$1 --matmul --mode=P --perf-template=%prb%,%-time% --batch=${SCRIPT_DIR}/inputs/matmul_nightly >> $3" + "$2 --matmul --mode=P --perf-template=%prb%,%-time% --batch=${SCRIPT_DIR}/inputs/matmul_nightly >> $4" + "$1 --conv --mode=P --perf-template=%prb%,%-time% --batch=${SCRIPT_DIR}/inputs/conv_nightly >> $3" + "$2 --conv --mode=P --perf-template=%prb%,%-time% --batch=${SCRIPT_DIR}/inputs/conv_nightly >> $4" + "$1 --eltwise --mode=P --perf-template=%prb%,%-time% --batch=${SCRIPT_DIR}/inputs/eltwise_nightly >> $3" + "$2 --eltwise --mode=P --perf-template=%prb%,%-time% --batch=${SCRIPT_DIR}/inputs/eltwise_nightly >> $4" + "$1 --reorder --mode=P --perf-template=%prb%,%-time% --batch=${SCRIPT_DIR}/inputs/reorder_nightly >> $3" + "$2 --reorder --mode=P --perf-template=%prb%,%-time% --batch=${SCRIPT_DIR}/inputs/reorder_nightly >> $4" + ) + +N=5 + +for i in $( seq $N ) +do + echo "Testing loop ${i} / ${N}..." + + TESTS=( $(shuf -e "${TESTS[@]}") ) + + for test in "${TESTS[@]}" + do + echo "Starting ${test}" + SECONDS=0 + eval $test + duration=$SECONDS + echo "Completed in $((duration / 60)):$((duration % 60))" + done +done diff --git a/.github/automation/performance/bench_pr_performance.sh b/.github/automation/performance/bench_pr_performance.sh new file mode 100755 index 00000000000..48f0eebd643 --- /dev/null +++ b/.github/automation/performance/bench_pr_performance.sh @@ -0,0 +1,52 @@ +#! /bin/bash + +# ******************************************************************************* +# Copyright 2025 Arm Limited and affiliates. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ******************************************************************************* + +# Usage: bash bench_pr_performance.sh {baseline_benchdnn_executable} {benchdnn_executable} {baseline_results_file} {new_results_file} + +IFS=$'\n' # Prevents shuffling from using spaces as delimiters +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) + +TESTS=( + "$1 --matmul --mode=P --perf-template=%prb%,%-time% --batch=${SCRIPT_DIR}/inputs/matmul >> $3" + "$2 --matmul --mode=P --perf-template=%prb%,%-time% --batch=${SCRIPT_DIR}/inputs/matmul >> $4" + "$1 --conv --mode=P --perf-template=%prb%,%-time% --batch=${SCRIPT_DIR}/inputs/conv >> $3" + "$2 --conv --mode=P --perf-template=%prb%,%-time% --batch=${SCRIPT_DIR}/inputs/conv >> $4" + "$1 --eltwise --mode=P --perf-template=%prb%,%-time% --batch=${SCRIPT_DIR}/inputs/eltwise >> $3" + "$2 --eltwise --mode=P --perf-template=%prb%,%-time% --batch=${SCRIPT_DIR}/inputs/eltwise >> $4" + "$1 --reorder --mode=P --perf-template=%prb%,%-time% --batch=${SCRIPT_DIR}/inputs/reorder >> $3" + "$2 --reorder --mode=P --perf-template=%prb%,%-time% --batch=${SCRIPT_DIR}/inputs/reorder >> $4" + ) + +N=5 + +for i in $( seq $N ) +do + echo "Testing loop ${i} / ${N}..." + + TESTS=( $(shuf -e "${TESTS[@]}") ) + + for test in "${TESTS[@]}" + do + echo "Starting ${test}" + SECONDS=0 + eval $test + duration=$SECONDS + echo "Completed in $((duration / 60)):$((duration % 60))" + done +done diff --git a/.github/automation/performance/benchdnn_comparison.py b/.github/automation/performance/benchdnn_comparison.py new file mode 100644 index 00000000000..1dba59e4e4b --- /dev/null +++ b/.github/automation/performance/benchdnn_comparison.py @@ -0,0 +1,98 @@ +#!/usr/bin/python3 + +# ******************************************************************************* +# Copyright 2025 Arm Limited and affiliates. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# *******************************************************************************
+
+import sys
+import os
+from collections import defaultdict
+from scipy.stats import ttest_ind
+import warnings
+import statistics
+
+
+def compare_two_benchdnn(file1, file2, tolerance=0.05):
+    """
+    Compare two benchdnn output files
+    """
+    with open(file1) as f:
+        r1 = f.readlines()
+
+    with open(file2) as f:
+        r2 = f.readlines()
+
+    # Trim non-formatted lines and split the problem from time
+    r1 = [x.split(",") for x in r1 if x[0:8] == "--mode=P"]
+    r2 = [x.split(",") for x in r2 if x[0:8] == "--mode=P"]
+
+    if (len(r1) == 0) or (len(r2) == 0):
+        warnings.warn("One or both of the test results have zero lines")
+    if len(r1) != len(r2):
+        warnings.warn("The number of benchdnn runs does not match")
+
+    r1_samples = defaultdict(list)
+    r2_samples = defaultdict(list)
+
+    # v ends with a newline; strip it before parsing the time.
+    for k, v in r1:
+        r1_samples[k].append(float(v[:-1]))
+    for k, v in r2:
+        r2_samples[k].append(float(v[:-1]))
+
+    failed_tests = []
+    times = {}
+    for prb, r1_times in r1_samples.items():
+        if prb not in r2_samples:
+            warnings.warn(f"{prb} exists in {file1} but not {file2}")
+            continue
+
+        r2_times = r2_samples[prb]
+
+        res = ttest_ind(r2_times, r1_times, alternative='greater')
+        r1_med = statistics.median(r1_times)
+        r2_med = statistics.median(r2_times)
+        times[prb] = (r1_med, r2_med)
+        times_str = f" {times[prb][0]} vs {times[prb][1]}"
+
+        # pass the test if:
+        # the t-test passes (i.e. pvalue > 0.05) OR
+        # both the median time and min time have not
+        # slowed down by more than 10%
+        passed = res.pvalue > 0.05 or \
+            ((r2_med - r1_med) / r1_med < 0.1 and \
+            (min(r2_times) - min(r1_times)) / min(r1_times) < 0.1)
+        if not passed:
+            failed_tests.append(prb + times_str)
+
+    if "GITHUB_OUTPUT" in os.environ:
+        with open(os.environ["GITHUB_OUTPUT"], "a") as f:
+            print(f"pass={not failed_tests}", file=f)
+
+    if not failed_tests:
+        print("Regression tests passed")
+    else:
+        message = "\n----The following regression tests failed:----\n" + \
+            "\n".join(failed_tests) + "\n"
+        if "GITHUB_OUTPUT" in os.environ:
+            out_message = message.replace("\n", "%0A")
+            with open(os.environ["GITHUB_OUTPUT"], "a") as f:
+                print(f'message={out_message}', file=f)
+        print(message)
+        raise Exception("Some regression tests failed")
+
+if __name__ == "__main__":
+    compare_two_benchdnn(sys.argv[1], sys.argv[2])
diff --git a/.github/automation/performance/inputs/conv b/.github/automation/performance/inputs/conv
new file mode 100644
index 00000000000..554fb596465
--- /dev/null
+++ b/.github/automation/performance/inputs/conv
@@ -0,0 +1,27 @@
+# *******************************************************************************
+# Copyright 2025 Arm Limited and affiliates.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# ******************************************************************************* + +--reset +--dir=FWD_D +--dt=f32 +mb1_ic64oc256_ih200oh200kh1sh1dh0ph0_iw267ow267kw1sw1dw0pw0 + +--reset +--dir=FWD_D +--dt=f32 +--attr-fpmath=bf16 +mb1_ic64oc256_ih200oh200kh1sh1dh0ph0_iw267ow267kw1sw1dw0pw0 diff --git a/.github/automation/performance/inputs/conv_nightly b/.github/automation/performance/inputs/conv_nightly new file mode 100644 index 00000000000..04699bee0e5 --- /dev/null +++ b/.github/automation/performance/inputs/conv_nightly @@ -0,0 +1,25 @@ +# ******************************************************************************* +# Copyright 2025 Arm Limited and affiliates. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ******************************************************************************* +--reset +--batch=conv + +--reset +--dt=f32 +--alg=auto +--dir=FWD_D,FWD_B +--attr-fpmath=,bf16 +--batch=shapes_resnet_50 diff --git a/.github/automation/performance/inputs/eltwise b/.github/automation/performance/inputs/eltwise new file mode 100644 index 00000000000..35935ce2629 --- /dev/null +++ b/.github/automation/performance/inputs/eltwise @@ -0,0 +1,23 @@ +# ******************************************************************************* +# Copyright 2025 Arm Limited and affiliates. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ******************************************************************************* +--reset +--inplace=true +--alg=gelu_erf +--dir=FWD_D +--dt=f32,bf16 +--tag=abc +1536x384 diff --git a/.github/automation/performance/inputs/eltwise_nightly b/.github/automation/performance/inputs/eltwise_nightly new file mode 100644 index 00000000000..801ff66a3ba --- /dev/null +++ b/.github/automation/performance/inputs/eltwise_nightly @@ -0,0 +1,41 @@ +# ******************************************************************************* +# Copyright 2025 Arm Limited and affiliates. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# ******************************************************************************* + +--reset +--batch=eltwise + +--reset + +--dt=f32 +--tag=abx,axb +--dir=FWD_D +--attr-post-ops=, + +## algs which do not support alpha and beta + relu with alpha=0 +--alpha=0 --beta=0 +--alg=exp,exp_dst,gelu_erf,gelu_tanh,relu_dst,tanh,tanh_dst +384x384 + +## algs which support negative alpha +--alpha=-2 --beta=0 +--alg=elu,relu,swish +384x384 + +## algs which support alpha and beta +--alpha=-2 --beta=3 +--alg=linear +384x384 diff --git a/.github/automation/performance/inputs/matmul b/.github/automation/performance/inputs/matmul new file mode 100644 index 00000000000..f9deaac91f0 --- /dev/null +++ b/.github/automation/performance/inputs/matmul @@ -0,0 +1,32 @@ +# ******************************************************************************* +# Copyright 2025 Arm Limited and affiliates. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ******************************************************************************* +--reset +--stag=ab +--wtag=any +--dtag=ab +--attr-post-ops=sum +--dt=f32 +1500x1536:1536x384 + +--reset +--stag=ab +--wtag=any +--dtag=ab +--attr-post-ops=sum +--attr-fpmath=bf16 +--dt=f32 +1500x1536:1536x384 diff --git a/.github/automation/performance/inputs/matmul_nightly b/.github/automation/performance/inputs/matmul_nightly new file mode 100644 index 00000000000..9a37d97c7ec --- /dev/null +++ b/.github/automation/performance/inputs/matmul_nightly @@ -0,0 +1,43 @@ +# ******************************************************************************* +# Copyright 2025 Arm Limited and affiliates. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ******************************************************************************* +--reset +--batch=matmul + +# Plain cases +--reset +--dt=f32,s8:s8:f32 +--bia-dt=f32,undef +--bia_mask=2 +--batch=shapes_2d_ci +--bia_mask=4 +--batch=shapes_3d + +--dt=f32 +--bia-dt=f32,undef +--bia_mask=2 +--attr-fpmath=bf16 +--batch=shapes_2d_ci +--bia_mask=4 +--batch=shapes_3d + +#f16 +--dt=f16:f16:f16 +--bia-dt=undef +--bia_mask=2 +--batch=shapes_2d_ci +--bia_mask=4 +--batch=shapes_3d diff --git a/.github/automation/performance/inputs/reorder b/.github/automation/performance/inputs/reorder new file mode 100644 index 00000000000..38441f10f2c --- /dev/null +++ b/.github/automation/performance/inputs/reorder @@ -0,0 +1,39 @@ +# ******************************************************************************* +# Copyright 2025 Arm Limited and affiliates. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ******************************************************************************* + +--reset +--sdt=f32 +--ddt=f32 +--allow-enum-tags-only=0 +--stag=ba +--dtag=Ab4a,Ab8a +384x384 + +--reset +--sdt=f32 +--ddt=bf16 +--allow-enum-tags-only=0 +--stag=ba +--dtag=BA8b4a,BA4b4a +384x384 + +--reset +--sdt=bf16 +--ddt=f32 +--allow-enum-tags-only=0 +--stag=BA8b4a,BA4b4a +384x384 diff --git a/.github/automation/performance/inputs/reorder_nightly b/.github/automation/performance/inputs/reorder_nightly new file mode 100644 index 00000000000..62e5f9ed262 --- /dev/null +++ b/.github/automation/performance/inputs/reorder_nightly @@ -0,0 +1,27 @@ +# ******************************************************************************* +# Copyright 2025 Arm Limited and affiliates. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ******************************************************************************* + +--reset +--batch=reorder + +--reset +--sdt=f32,s8 +--ddt=f32,s8 + +--stag=abx,axb,aBx4b,aBx8b +--dtag=abx,axb,aBx4b,aBx8b +4x256x5x5 diff --git a/.github/automation/x64/build_linters.sh b/.github/automation/x64/build_linters.sh new file mode 100755 index 00000000000..58951e313ca --- /dev/null +++ b/.github/automation/x64/build_linters.sh @@ -0,0 +1,40 @@ +# Build oneDNN for PR linter checks. + +set -o errexit -o pipefail -o noclobber + +export CC=clang +export CXX=clang++ + +if [[ "$ONEDNN_ACTION" == "configure" ]]; then + if [[ "$GITHUB_JOB" == "pr-clang-tidy" ]]; then + set -x + cmake \ + -Bbuild -S. 
\
+            -DCMAKE_BUILD_TYPE=debug \
+            -DONEDNN_BUILD_GRAPH=ON \
+            -DDNNL_EXPERIMENTAL=ON \
+            -DDNNL_EXPERIMENTAL_SPARSE=ON \
+            -DDNNL_EXPERIMENTAL_PROFILING=ON \
+            -DDNNL_EXPERIMENTAL_UKERNEL=ON \
+            -DONEDNN_EXPERIMENTAL_LOGGING=ON \
+            -DDNNL_CPU_RUNTIME=OMP \
+            -DDNNL_GPU_RUNTIME=OCL \
+            -DDNNL_WERROR=ON \
+            -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
+        set +x
+    elif [[ "$GITHUB_JOB" == "pr-format-tags" ]]; then
+        set -x
+        cmake -B../build -S. -DONEDNN_BUILD_GRAPH=OFF -DDNNL_EXPERIMENTAL_SPARSE=ON
+        set +x
+    else
+        echo "Unknown linter job: $GITHUB_JOB"
+        exit 1
+    fi
+elif [[ "$ONEDNN_ACTION" == "build" ]]; then
+    set -x
+    cmake --build build -j`nproc`
+    set +x
+else
+    echo "Unknown action: $ONEDNN_ACTION"
+    exit 1
+fi
diff --git a/.github/automation/build.bat b/.github/azure/build.bat
similarity index 100%
rename from .github/automation/build.bat
rename to .github/azure/build.bat
diff --git a/.github/azure/build.sh b/.github/azure/build.sh
new file mode 100755
index 00000000000..f8a589b8bb1
--- /dev/null
+++ b/.github/azure/build.sh
@@ -0,0 +1,108 @@
+#! /bin/bash
+
+#===============================================================================
+# Copyright 2019-2023 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#===============================================================================
+
+
+while [[ $# -gt 0 ]]; do
+    key="$1"
+
+    case $key in
+        --threading)
+        BUILD_THREADING="$2"
+        ;;
+        --mode)
+        BUILD_MODE="$2"
+        ;;
+        --source-dir)
+        SOURCE_DIR="$2"
+        ;;
+        --acl-dir)
+        ACL_DIR="$2"
+        ;;
+        --build-dir)
+        BUILD_DIR="$2"
+        ;;
+        --cmake-opt)
+        CMAKE_OPT="$2"
+        ;;
+        *)
+        echo "Unknown option: $1"
+        exit 1
+        ;;
+    esac
+    shift
+    shift
+done
+
+CMAKE_OPTIONS="-DCMAKE_BUILD_TYPE=${BUILD_MODE} -DDNNL_BUILD_FOR_CI=ON -DDNNL_WERROR=ON ${CMAKE_OPT}"
+
+CPU_RUNTIME="NONE"
+GPU_RUNTIME="NONE"
+
+if [ "${BUILD_THREADING}" == "tbb" ]; then
+    CPU_RUNTIME="TBB"
+    echo "Info: Setting DNNL_CPU_RUNTIME to TBB..."
+elif [ "${BUILD_THREADING}" == "omp" ]; then
+    echo "Info: Setting DNNL_CPU_RUNTIME to OMP..."
+    CPU_RUNTIME="OMP"
+elif [ "${BUILD_THREADING}" == "ocl" ]; then
+    echo "Info: Setting DNNL_CPU_RUNTIME to OMP..."
+    echo "Info: Setting DNNL_GPU_RUNTIME to OCL..."
+    CPU_RUNTIME="OMP"
+    GPU_RUNTIME="OCL"
+else
+    echo "Error: unknown threading: ${BUILD_THREADING}"
+    exit 1
+fi
+
+CMAKE_OPTIONS="${CMAKE_OPTIONS}
+    -DDNNL_CPU_RUNTIME=${CPU_RUNTIME}
+    -DDNNL_GPU_RUNTIME=${GPU_RUNTIME}
+    -DDNNL_TEST_SET=SMOKE
+    "
+
+# Enable Compute Library backend if a location for the built library is given
+# NOTE: only for AArch64 builds.
+if [ ! -z ${ACL_DIR} ]; then
+    export ACL_ROOT_DIR=$ACL_DIR
+    CMAKE_OPTIONS="${CMAKE_OPTIONS} -DDNNL_USE_ACL=ON"
+    echo "Info: Building with Arm Compute Library backend for AArch64..."
+fi
+
+if [ "$(uname)" == "Linux" ]; then
+    MAKE_OP="-j$(grep -c processor /proc/cpuinfo)"
+else
+    MAKE_OP="-j$(sysctl -n hw.physicalcpu)"
+fi
+
+cd "${SOURCE_DIR}"
+echo "Calling CMake with options: ${CMAKE_OPTIONS}"
+cmake . -B${BUILD_DIR} ${CMAKE_OPTIONS}
+err=$?
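+# Note: cmake's exit status is captured above so that, on a configure
+# failure, the CMakeOutput.log/CMakeError.log dumps below can run before
+# the script exits; this makes CI triage easier.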
+if [ "$err" != 0 ]; then + if [ -e "${BUILD_DIR}/CMakeFiles/CMakeOutput.log" ]; then + echo "CMakeOutput.log:" + cat ${BUILD_DIR}/CMakeFiles/CMakeOutput.log + fi + if [ -e "${BUILD_DIR}/CMakeFiles/CMakeError.log" ]; then + echo "CMakeError.log:" + cat ${BUILD_DIR}/CMakeFiles/CMakeError.log + fi + exit $err +fi +cd ${BUILD_DIR} && make -k ${MAKE_OP} +exit $? diff --git a/.github/azure/ci-x64.yml b/.github/azure/ci-x64.yml new file mode 100644 index 00000000000..2ee792dd7cc --- /dev/null +++ b/.github/azure/ci-x64.yml @@ -0,0 +1,111 @@ +#=============================================================================== +# Copyright 2019-2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +#=============================================================================== + +# Reference: +# https://learn.microsoft.com/en-us/azure/devops/pipelines/yaml-schema + +trigger: + batch: true + branches: + include: [ main, 'rls-*' ] + paths: + include: + - .github/azure + - cmake + - examples + - include + - src + - tests + - CMakeLists.txt + exclude: + - src/gpu + - src/cpu/aarch64 + - src/cpu/ppc64 + - src/cpu/rv64 + - src/cpu/s390x + - src/xpu + +pr: + autoCancel: true + branches: + include: [ main, 'rls-*' ] + paths: + include: + - .github/azure + - cmake + - examples + - include + - src + - tests + - CMakeLists.txt + exclude: + - src/gpu + - src/cpu/aarch64 + - src/cpu/ppc64 + - src/cpu/rv64 + - src/cpu/s390x + - src/xpu + +jobs: + - job: 'Ubuntu22' + timeoutInMinutes: 120 + pool: + vmImage: 'ubuntu-22.04' + strategy: + matrix: + clang: + CC: clang + CXX: clang++ + gcc: + CC: gcc + CXX: g++ + steps: + - script: | + if [ "$(CC)" == "clang" ]; then + .github/azure/env/clang.sh 15 + fi + displayName: "Init_Env" + - script: | + .github/azure/build.sh --threading omp --mode Release --source-dir $(pwd) --build-dir $(pwd)/build + displayName: 'build' + - script: | + .github/azure/test.sh --build-dir $(pwd)/build --report-dir $(pwd)/report + displayName: 'test' + failOnStderr: true + - job: 'macOS14' + timeoutInMinutes: 120 + pool: + vmImage: 'macOS-14' + steps: + - script: | + .github/azure/build.sh --threading omp --mode Release --source-dir $(pwd) --build-dir $(pwd)/build + displayName: 'build' + - script: | + .github/azure/test.sh --build-dir $(pwd)/build --report-dir $(pwd)/report + displayName: 'test' + failOnStderr: true + - job: 'Windows_Server_2022' + timeoutInMinutes: 120 + pool: + vmImage: 'windows-2022' + steps: + - script: | + .github\azure\build.bat /THREADING omp /MODE Release /VSVERSION vs2022 /SOURCEDIR %CD% /BUILDDIR %CD%\build + displayName: 'build' + - script: | + .github\azure\test.bat /BUILDDIR %CD%\build /MODE Release /REPORTDIR %CD%\report + displayName: 'test' + failOnStderr: true diff --git a/.github/automation/env/clang.sh b/.github/azure/env/clang.sh similarity index 100% rename from .github/automation/env/clang.sh rename to .github/azure/env/clang.sh diff --git a/.github/automation/test.bat b/.github/azure/test.bat similarity index 100% rename from 
.github/automation/test.bat rename to .github/azure/test.bat diff --git a/.github/automation/test.sh b/.github/azure/test.sh similarity index 100% rename from .github/automation/test.sh rename to .github/azure/test.sh diff --git a/.github/labels.yml b/.github/labels.yml index 81e5e89ac72..98ec4fdde80 100644 --- a/.github/labels.yml +++ b/.github/labels.yml @@ -1,5 +1,5 @@ #=============================================================================== -# Copyright 2024 Intel Corporation +# Copyright 2024-2025 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -41,6 +41,33 @@ documentation: - changed-files: - any-glob-to-any-file: ['**/*.md', 'doc/**'] +# Common code +component:build: +- changed-files: + - any-glob-to-any-file: + - 'cmake/**' + - 'CMakeLists.txt' + +component:examples: +- changed-files: + - any-glob-to-any-file: 'examples/**' + +component:tests: +- changed-files: + - any-glob-to-any-file: 'tests/**' + +component:api: +- changed-files: + - any-glob-to-any-file: 'include/**' + +component:graph-api: +- changed-files: + - any-glob-to-any-file: + - 'src/graph/**' + - 'tests/benchdnn/graph/**' + - 'tests/gtests/graph/**' + - 'doc/graph/**' + # CPU Engine platform:cpu-aarch64: - changed-files: diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index 8dde2c631ea..7e7f8473297 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -1,6 +1,6 @@ # Description -Please include a summary of the change. Please also include relevant motivation and context. See [contribution guidelines](https://github.com/oneapi-src/oneDNN/blob/master/CONTRIBUTING.md) for more details. If the change fixes an issue not documented in the project's Github issue tracker, please document all steps necessary to reproduce it. +Please include a summary of the change. Please also include relevant motivation and context. See [contribution guidelines](https://github.com/uxlfoundation/oneDNN/blob/main/CONTRIBUTING.md) for more details. If the change fixes an issue not documented in the project's Github issue tracker, please document all steps necessary to reproduce it. Fixes # (github issue) @@ -26,7 +26,7 @@ Fixes # (github issue) - [ ] Have you included information on how to reproduce the issue (either in a github issue or in this PR)? - [ ] Have you added relevant regression tests? -## [RFC](https://github.com/oneapi-src/oneDNN/tree/rfcs) PR +## [RFC](https://github.com/uxlfoundation/oneDNN/tree/rfcs) PR -- [ ] Does RFC document follow the [template](https://github.com/oneapi-src/oneDNN/blob/rfcs/rfcs/template.md#onednn-design-document-rfc)? +- [ ] Does RFC document follow the [template](https://github.com/uxlfoundation/oneDNN/blob/rfcs/rfcs/template.md#onednn-design-document-rfc)? - [ ] Have you added a link to the rendered document? diff --git a/.github/workflows/aarch64-acl.yml b/.github/workflows/aarch64-acl.yml new file mode 100644 index 00000000000..a241b5680da --- /dev/null +++ b/.github/workflows/aarch64-acl.yml @@ -0,0 +1,124 @@ +# ******************************************************************************* +# Copyright 2025 Arm Limited and affiliates. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# *******************************************************************************
+
+name: "Build ACL cache"
+
+#* To avoid duplicate jobs running when both push and PR triggers are satisfied, we use this:
+#* https://github.com/orgs/community/discussions/26940#discussioncomment-5686753
+on:
+  workflow_call:
+  workflow_dispatch:
+
+# Declare default permissions as read only.
+permissions: read-all
+
+jobs:
+  # Cache is built sequentially to avoid cache-hit race conditions
+  build-cache:
+    strategy:
+      max-parallel: 1
+      matrix:
+        config: [
+          { name: MacOS, label: macos-14, threading: SEQ, toolset: clang, build: Release },
+          { name: cb100, label: ubuntu-24.04-arm, threading: OMP, toolset: gcc, build: Release },
+          { name: c6g, label: ah-ubuntu_22_04-c6g_2x-50, threading: OMP, toolset: clang, build: Debug },
+          { name: c6g, label: ah-ubuntu_22_04-c6g_2x-50, threading: OMP, toolset: gcc, build: Release }
+        ]
+
+    name: ${{ matrix.config.name }}, ${{ matrix.config.toolset }}, ${{ matrix.config.threading }}, ${{ matrix.config.build }}
+    runs-on: ${{ matrix.config.label }}
+    steps:
+      - name: Checkout oneDNN
+        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        with:
+          path: oneDNN
+
+      - name: Read version file
+        id: get-versions
+        run: |
+          content=`cat ${{ github.workspace }}/oneDNN/.github/automation/aarch64/ci.json`
+          content="${content//[$'\t\r\n$ ']}"
+          echo "output=$content" >> $GITHUB_OUTPUT
+
+      - name: Clone ACL
+        run: ${{ github.workspace }}/oneDNN/.github/automation/aarch64/build_acl.sh
+        env:
+          ACL_ACTION: clone
+          ACL_ROOT_DIR: ${{ github.workspace }}/ComputeLibrary
+          ACL_VERSION: ${{ fromJson(steps.get-versions.outputs.output).dependencies.acl }}
+
+      - name: Get ACL commit hash for cache key
+        id: get_acl_commit_hash
+        run: (cd ${{ github.workspace }}/ComputeLibrary && echo "ACLCommitHash=$(git rev-parse --short HEAD)") >> $GITHUB_OUTPUT
+
+      - name: Get system name
+        id: get_system_name
+        run: (echo "SystemName=$(uname)") >> $GITHUB_OUTPUT
+
+      - name: Restore cached ACL
+        id: cache-acl-restore
+        uses: actions/cache/restore@v4
+        with:
+          key: ${{ steps.get_system_name.outputs.SystemName }}-acl-${{ matrix.config.toolset }}-${{ matrix.config.build }}-${{ steps.get_acl_commit_hash.outputs.ACLCommitHash }}
+          path: ${{ github.workspace }}/ComputeLibrary/build
+
+      - name: Install Scons (MacOS)
+        if: ${{ matrix.config.name == 'MacOS' && (steps.cache-acl-restore.outputs.cache-hit != 'true') }}
+        run: brew install scons
+
+      - name: Install Scons (Linux)
+        if: ${{ matrix.config.name != 'MacOS' && (steps.cache-acl-restore.outputs.cache-hit != 'true') }}
+        run: |
+          sudo apt update -y
+          sudo apt install -y scons
+
+      - if: ${{ contains(matrix.config.label,'ubuntu') && (matrix.config.threading == 'OMP') && (steps.cache-acl-restore.outputs.cache-hit != 'true') }}
+        name: Install openmp
+        run: |
+          sudo apt install -y libomp-dev
+
+      - if: ${{ contains(matrix.config.label,'ubuntu') && (matrix.config.toolset == 'gcc') && (steps.cache-acl-restore.outputs.cache-hit != 'true') }}
+        name: Install gcc
+        run: |
+          sudo add-apt-repository ppa:ubuntu-toolchain-r/test -y
+          sudo apt update -y
+          sudo apt install -y g++-${{ fromJson(steps.get-versions.outputs.output).dependencies.gcc }}
+
+      - if: ${{ contains(matrix.config.label,'ubuntu') && (matrix.config.toolset == 'clang') && (steps.cache-acl-restore.outputs.cache-hit != 'true') }}
+        name: Install clang
+        uses: KyleMayes/install-llvm-action@6ba6e2cd3813def9879be378609d87cb3ef3bac3
+        with:
+          version: ${{ fromJson(steps.get-versions.outputs.output).dependencies.clang }}
+
+      - name: Build ACL
+        if: ${{ steps.cache-acl-restore.outputs.cache-hit != 'true' }}
+        run: ${{ github.workspace }}/oneDNN/.github/automation/aarch64/build_acl.sh
+        env:
+          ACL_ACTION: build
+          ACL_ROOT_DIR: ${{ github.workspace }}/ComputeLibrary
+          ACL_THREADING: ${{ matrix.config.threading }}
+          BUILD_TOOLSET: ${{ matrix.config.toolset }}
+          ACL_BUILD_TYPE: ${{ matrix.config.build }}
+          GCC_VERSION: ${{ fromJson(steps.get-versions.outputs.output).dependencies.gcc }}
+
+      - name: Save ACL in cache
+        id: cache-acl_build-save
+        if: ${{ steps.cache-acl-restore.outputs.cache-hit != 'true' }}
+        uses: actions/cache/save@v4
+        with:
+          key: ${{ steps.get_system_name.outputs.SystemName }}-acl-${{ matrix.config.toolset }}-${{ matrix.config.build }}-${{ steps.get_acl_commit_hash.outputs.ACLCommitHash }}
+          path: ${{ github.workspace }}/ComputeLibrary/build
diff --git a/.github/workflows/ci-aarch64.yml b/.github/workflows/ci-aarch64.yml
new file mode 100644
index 00000000000..6b0a2923c97
--- /dev/null
+++ b/.github/workflows/ci-aarch64.yml
@@ -0,0 +1,241 @@
+# *******************************************************************************
+# Copyright 2024-2025 Arm Limited and affiliates.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# *******************************************************************************
+
+name: "CI AArch64"
+
+#* To avoid duplicate jobs running when both push and PR triggers are satisfied, we use this:
+#* https://github.com/orgs/community/discussions/26940#discussioncomment-5686753
+on:
+  push:
+    branches: [main, "rls-*"]
+    paths:
+      - ".github/automation/*"
+      - ".github/automation/aarch64"
+      - ".github/workflows/aarch64-acl.yml"
+      - ".github/workflows/ci-aarch64.yml"
+      - "cmake/**"
+      - "examples/**"
+      - "include/**"
+      - "src/common/**"
+      - "src/cpu/*"
+      - "src/cpu/aarch64/**"
+      - "tests/**"
+      - "CMakeLists.txt"
+  pull_request:
+    types: [opened, synchronize, reopened]
+    paths:
+      - ".github/automation/*"
+      - ".github/automation/aarch64"
+      - ".github/workflows/aarch64-acl.yml"
+      - ".github/workflows/ci-aarch64.yml"
+      - "cmake/**"
+      - "examples/**"
+      - "include/**"
+      - "src/common/**"
+      - "src/cpu/*"
+      - "src/cpu/aarch64/**"
+      - "tests/**"
+      - "CMakeLists.txt"
+  #* allow manual trigger of workflow when needed.
+  workflow_dispatch:
+
+#* Stop stale workflows when pull requests are updated: https://stackoverflow.com/a/70972844
+#* Does not apply to the main branch.
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
+
+# Declare default permissions as read only.
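+# Individual jobs can still request broader scopes with a job-level permissions block.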
+permissions: read-all + +jobs: + build-acl-cache: + uses: ./.github/workflows/aarch64-acl.yml + + build-and-test: + needs: build-acl-cache + strategy: + matrix: + config: [ + { name: MacOS, label: macos-14, threading: SEQ, toolset: clang, build: Release, testset: SMOKE }, + { name: cb100, label: ubuntu-24.04-arm, threading: OMP, toolset: gcc, build: Release, testset: SMOKE }, + { name: c6g, label: ah-ubuntu_22_04-c6g_4x-50, threading: OMP, toolset: gcc, build: Release, testset: CI }, + { name: c6g, label: ah-ubuntu_22_04-c6g_2x-50, threading: OMP, toolset: clang, build: Debug, testset: SMOKE }, + { name: c7g, label: ah-ubuntu_22_04-c7g_4x-50, threading: OMP, toolset: gcc, build: Release, testset: CI } + ] + + name: ${{ matrix.config.name }}, ${{ matrix.config.toolset }}, ${{ matrix.config.threading }}, ${{ matrix.config.build }} + runs-on: ${{ matrix.config.label }} + steps: + - name: Checkout oneDNN + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + path: oneDNN + + - name: Read version file + id: get-versions + run: | + content=`cat ${{ github.workspace }}/oneDNN/.github/automation/aarch64/ci.json` + content="${content//[$'\t\r\n$ ']}" + echo "output=$content" >> $GITHUB_OUTPUT + + # Note: This will create a github actions cache + - name: Get latest CMake and Ninja + uses: lukka/get-cmake@56d043d188c3612951d8755da8f4b709ec951ad6 # v3.31.6 + with: + cmakeVersion: 3.31.0 + ninjaVersion: 1.12.0 + + - if: ${{ (contains(matrix.config.label,'ubuntu') && (matrix.config.threading == 'OMP')) }} + name: Install openmp + run: | + sudo apt install -y libomp-dev + + - if: ${{ (contains(matrix.config.label,'ubuntu') && (matrix.config.toolset == 'gcc')) }} + name: Install gcc + run: | + sudo add-apt-repository ppa:ubuntu-toolchain-r/test -y + sudo apt update -y + sudo apt install -y g++-${{ fromJson(steps.get-versions.outputs.output).dependencies.gcc }} + + - if: ${{ (contains(matrix.config.label,'ubuntu') && (matrix.config.toolset == 'clang')) }} + name: Install clang + uses: KyleMayes/install-llvm-action@6ba6e2cd3813def9879be378609d87cb3ef3bac3 + with: + version: ${{ fromJson(steps.get-versions.outputs.output).dependencies.clang }} + + - name: setup python + uses: actions/setup-python@v5 + with: + python-version: '3.10' + + - name: Install scipy + if: ${{ matrix.config.build == 'Release' }} + run: pip install scipy statistics + + - name: Clone ACL + run: ${{ github.workspace }}/oneDNN/.github/automation/aarch64/build_acl.sh + env: + ACL_ACTION: clone + ACL_ROOT_DIR: ${{ github.workspace }}/ComputeLibrary + ACL_VERSION: ${{ fromJson(steps.get-versions.outputs.output).dependencies.acl }} + + - name: Get ACL commit hash for cache key + id: get_acl_commit_hash + run: (cd ${{ github.workspace }}/ComputeLibrary && echo "ACLCommitHash=$(git rev-parse --short HEAD)") >> $GITHUB_OUTPUT + + - name: Get system name + id: get_system_name + run: (echo "SystemName=$(uname)") >> $GITHUB_OUTPUT + + - name: Restore cached ACL + id: cache-acl-restore + uses: actions/cache/restore@v4 + with: + key: ${{ steps.get_system_name.outputs.SystemName }}-acl-${{ matrix.config.toolset }}-${{ matrix.config.build }}-${{ steps.get_acl_commit_hash.outputs.ACLCommitHash }} + path: ${{ github.workspace }}/ComputeLibrary/build + + - name: Configure oneDNN + run: ${{ github.workspace }}/oneDNN/.github/automation/aarch64/build.sh + working-directory: ${{ github.workspace }}/oneDNN + env: + ACL_ROOT_DIR: ${{ github.workspace }}/ComputeLibrary + BUILD_TOOLSET: ${{ matrix.config.toolset }} + 
CMAKE_BUILD_TYPE: ${{ matrix.config.build }} + CMAKE_GENERATOR: Ninja + GCC_VERSION: ${{ fromJson(steps.get-versions.outputs.output).dependencies.gcc }} + ONEDNN_ACTION: configure + ONEDNN_TEST_SET: ${{ matrix.config.testset }} + ONEDNN_THREADING: ${{ matrix.config.threading }} + + - name: Build oneDNN + run: ${{ github.workspace }}/oneDNN/.github/automation/aarch64/build.sh + working-directory: ${{ github.workspace }}/oneDNN + env: + ONEDNN_ACTION: build + + - name: Run oneDNN tests + run: ${{ github.workspace }}/oneDNN/.github/automation/aarch64/test.sh + working-directory: ${{ github.workspace }}/oneDNN/build + env: + BUILD_TOOLSET: ${{ matrix.config.toolset }} + CMAKE_BUILD_TYPE: ${{ matrix.config.build }} + CTEST_PARALLEL_LEVEL: 6 + DYLD_LIBRARY_PATH: ${{ github.workspace }}/ComputeLibrary/build + ONEDNN_THREADING: ${{ matrix.config.threading }} + + ## Performance test steps ## + - name: Checkout oneDNN base + if: ${{ github.event_name == 'pull_request' && matrix.config.build == 'Release' && matrix.config.name != 'cb100' }} + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + ref: ${{ github.base_ref }} + path: oneDNN_base + + - name: Configure oneDNN base + if: ${{ github.event_name == 'pull_request' && matrix.config.build == 'Release' && matrix.config.name != 'cb100' }} + run: ${{ github.workspace }}/oneDNN/.github/automation/aarch64/build.sh + working-directory: ${{ github.workspace }}/oneDNN_base + env: + ACL_ROOT_DIR: ${{ github.workspace }}/ComputeLibrary + BUILD_TOOLSET: ${{ matrix.config.toolset }} + CMAKE_BUILD_TYPE: ${{ matrix.config.build }} + CMAKE_GENERATOR: Ninja + GCC_VERSION: ${{ fromJson(steps.get-versions.outputs.output).dependencies.gcc }} + ONEDNN_ACTION: configure + ONEDNN_TEST_SET: ${{ matrix.config.testset }} + ONEDNN_THREADING: ${{ matrix.config.threading }} + + - name: Build oneDNN base + if: ${{ github.event_name == 'pull_request' && matrix.config.build == 'Release' && matrix.config.name != 'cb100' }} + run: ${{ github.workspace }}/oneDNN/.github/automation/aarch64/build.sh + working-directory: ${{ github.workspace }}/oneDNN_base + env: + ONEDNN_ACTION: build + + - name: Run performance tests + shell: bash + if: ${{ github.event_name == 'pull_request' && matrix.config.build == 'Release' && matrix.config.name != 'cb100' }} + run: | + OMP_NUM_THREADS=4 bash ${{ github.workspace }}/oneDNN/.github/automation/performance/bench_pr_performance.sh ${{ github.workspace }}/oneDNN_base/build/tests/benchdnn/benchdnn ${{ github.workspace }}/oneDNN/build/tests/benchdnn/benchdnn base_4.txt new_4.txt + OMP_NUM_THREADS=16 bash ${{ github.workspace }}/oneDNN/.github/automation/performance/bench_pr_performance.sh ${{ github.workspace }}/oneDNN_base/build/tests/benchdnn/benchdnn ${{ github.workspace }}/oneDNN/build/tests/benchdnn/benchdnn base_16.txt new_16.txt + env: + DYLD_LIBRARY_PATH: ${{ github.workspace }}/ComputeLibrary/build + + - name: Compare performance test results + if: ${{ github.event_name == 'pull_request' && matrix.config.build == 'Release' && matrix.config.name != 'cb100' }} + id: performance-test + continue-on-error: true + run: | + echo "4 threads:" + python ${{ github.workspace }}/oneDNN/.github/automation/performance/benchdnn_comparison.py base_4.txt new_4.txt + echo "16 threads:" + python ${{ github.workspace }}/oneDNN/.github/automation/performance/benchdnn_comparison.py base_16.txt new_16.txt + + - name: Check performance test failure + if: ${{ steps.performance-test.outputs.pass != 'True' && github.event_name == 
'pull_request' && matrix.config.build == 'Release' && matrix.config.name != 'cb100' }} + run: echo "::warning file=.github/workflows/ci-aarch64.yml,line=1,col=1::${{ steps.performance-test.outputs.message }}" + + # This job adds a check named "CI AArch64" that represents overall + # workflow status and can be used in branch rulesets + status: + needs: build-and-test + runs-on: ubuntu-latest + name: "CI AArch64" + steps: + - name: Print success + run: echo Success diff --git a/.github/workflows/clang-tidy.yml b/.github/workflows/clang-tidy.yml new file mode 100644 index 00000000000..f4bd097bb50 --- /dev/null +++ b/.github/workflows/clang-tidy.yml @@ -0,0 +1,85 @@ +name: "Clang-Tidy" + +on: + pull_request: + types: [opened, edited, synchronize, reopened] + paths: + - ".github/automation/x64/**" + - ".github/workflows/clang-tidy.yml" + - "cmake/**" + - "examples/**" + - "include/**" + - "src/common/**" + - "src/cpu/*" + - "src/cpu/gemm/**" + - "src/cpu/matmul/**" + - "src/cpu/reorder/**" + - "src/cpu/rnn/**" + - "src/cpu/x64/**" + - "src/gpu/*" + - "src/gpu/intel/**" + - "src/graph/**" + - "tests/**" + - "CMakeLists.txt" + +## Declare default permissions as read only. +permissions: read-all + +# Kill stale checks +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: ${{ github.ref != 'refs/heads/main' }} + +jobs: + pr-clang-tidy: + name: Clang-Tidy + runs-on: ubuntu-latest + steps: + - name: Checkout oneDNN + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + fetch-depth: 0 + + - name: Install clang + run: | + sudo apt-get update + sudo apt-get install -y clang libomp-dev ocl-icd-libopencl1 ocl-icd-opencl-dev + + - name: Configure oneDNN + run: .github/automation/x64/build_linters.sh + env: + ONEDNN_ACTION: configure + + - name: Check source files + run: | + echo -e "Checking Clang-Tidy $(clang-tidy --version)\n" + touch source-check.log + for file in $(git diff --name-only ${{ github.event.pull_request.head.sha }} ${{ github.event.pull_request.base.sha }} | grep -E '\.cpp$'); + do + if grep -q "$file" "build/compile_commands.json"; then + echo -e "\nAnalyzing $file" + clang-tidy -p build --header-filter='' $file 2>&1 | tee -a source-check.log + else + echo "Skipped $file as it's not built in x64 OpenMP/OpenCL configuration." + fi + done + grep -i -E "warning:|error:" source-check.log | sort -u + grep -q -i -E "warning:|error:" source-check.log && exit 1 || true + + - name: Check header files + if: always() + continue-on-error: true + run: | + echo -e "Checking Clang-Tidy $(clang-tidy --version)\n" + touch headers-check.log + for file in $(git diff --name-only ${{ github.event.pull_request.head.sha }} ${{ github.event.pull_request.base.sha }} | grep -E '\.cpp$'); + do + if grep -q "$file" "build/compile_commands.json"; then + echo -e "\nAnalyzing $file" + clang-tidy -p build $file 2>&1 | tee -a headers-check.log + else + echo "Skipped $file as it's not built in x64 OpenMP/OpenCL configuration." 
+            fi
+          done
+          grep -i -E "warning:|error:" headers-check.log | sort -u
+          grep -q -i -E "warning:|error:" headers-check.log && exit 1 || true
diff --git a/.github/workflows/labeler.yml b/.github/workflows/labeler.yml
index 09d2f175a7b..64d4471fb87 100644
--- a/.github/workflows/labeler.yml
+++ b/.github/workflows/labeler.yml
@@ -28,7 +28,7 @@ jobs:
       pull-requests: write
 
     steps:
-    - uses: actions/labeler@v5.0.0
+    - uses: actions/labeler@8558fd74291d67161a8a78ce36a881fa63b766a9 # v5.0.0
       with:
         sync-labels: true
         configuration-path: '.github/labels.yml'
diff --git a/.github/workflows/nightly-aarch64.yml b/.github/workflows/nightly-aarch64.yml
new file mode 100644
index 00000000000..d8086ae2f66
--- /dev/null
+++ b/.github/workflows/nightly-aarch64.yml
@@ -0,0 +1,146 @@
+# *******************************************************************************
+# Copyright 2024-2025 Arm Limited and affiliates.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# *******************************************************************************
+
+name: "Nightly AArch64"
+
+on:
+  #* allow manual trigger of workflow when needed. Useful for a nightly.
+  workflow_dispatch:
+  schedule:
+    #* minute (0-59) hour (0-23) day (1-31) month (1-12) day of the week (0 - 6)
+    #* cron jobs run on the default (main) branch.
+    #* set to run at 5am UTC
+    - cron: "0 5 * * *"
+
+#* Stop stale workflows, though we should never hit this unless it hangs for a whole day.
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+# Declare default permissions as read only.
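+# No job in this workflow needs write access to the repository.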
+permissions: read-all + +jobs: + build-acl-cache: + uses: ./.github/workflows/aarch64-acl.yml + + test-performance: + uses: ./.github/workflows/performance-aarch64.yml + + build-and-test: + needs: build-acl-cache + strategy: + matrix: + config: [ + { name: c6g, label: ah-ubuntu_22_04-c6g_8x-100, threading: OMP, toolset: gcc, build: Release, testset: NIGHTLY }, + { name: c7g, label: ah-ubuntu_22_04-c7g_8x-100, threading: OMP, toolset: gcc, build: Release, testset: NIGHTLY } + ] + + name: ${{ matrix.config.name }}, ${{ matrix.config.toolset }}, ${{ matrix.config.threading }}, ${{ matrix.config.build }} + runs-on: ${{ matrix.config.label }} + steps: + + - name: Checkout oneDNN + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + path: oneDNN + + # Note: This will create a github actions cache + - name: Get latest CMake and Ninja + uses: lukka/get-cmake@56d043d188c3612951d8755da8f4b709ec951ad6 # v3.31.6 + with: + cmakeVersion: 3.31.0 + ninjaVersion: 1.12.0 + + - if: ${{ matrix.config.threading == 'OMP' }} + name: Install openmp + run: | + sudo apt install -y libomp-dev + + - name: Read version file + id: get-versions + run: | + content=`cat ${{ github.workspace }}/oneDNN/.github/automation/aarch64/ci.json` + content="${content//[$'\t\r\n$ ']}" + echo "output=$content" >> $GITHUB_OUTPUT + + - name: Install gcc + run: | + sudo add-apt-repository ppa:ubuntu-toolchain-r/test -y + sudo apt update -y + sudo apt install -y g++-${{ fromJson(steps.get-versions.outputs.output).dependencies.gcc }} + + - name: Clone ACL + run: ${{ github.workspace }}/oneDNN/.github/automation/aarch64/build_acl.sh + env: + ACL_ACTION: clone + ACL_ROOT_DIR: ${{ github.workspace }}/ComputeLibrary + ACL_VERSION: ${{ fromJson(steps.get-versions.outputs.output).dependencies.acl }} + + - name: Get ACL commit hash for cache key + id: get_acl_commit_hash + run: (cd ${{ github.workspace }}/ComputeLibrary && echo "ACLCommitHash=$(git rev-parse --short HEAD)") >> $GITHUB_OUTPUT + + - name: Get system name + id: get_system_name + run: (echo "SystemName=$(uname)") >> $GITHUB_OUTPUT + + - name: Restore cached ACL + id: cache-acl-restore + uses: actions/cache/restore@v4 + with: + key: ${{ steps.get_system_name.outputs.SystemName }}-acl-${{ matrix.config.toolset }}-${{ matrix.config.build }}-${{ steps.get_acl_commit_hash.outputs.ACLCommitHash }} + path: ${{ github.workspace }}/ComputeLibrary/build + + - name: Configure oneDNN + run: ${{ github.workspace }}/oneDNN/.github/automation/aarch64/build.sh + working-directory: ${{ github.workspace }}/oneDNN + env: + ACL_ROOT_DIR: ${{ github.workspace }}/ComputeLibrary + BUILD_TOOLSET: ${{ matrix.config.toolset }} + CMAKE_BUILD_TYPE: ${{ matrix.config.build }} + CMAKE_GENERATOR: Ninja + GCC_VERSION: ${{ fromJson(steps.get-versions.outputs.output).dependencies.gcc }} + ONEDNN_ACTION: configure + ONEDNN_TEST_SET: ${{ matrix.config.testset }} + ONEDNN_THREADING: ${{ matrix.config.threading }} + + - name: Build oneDNN + run: ${{ github.workspace }}/oneDNN/.github/automation/aarch64/build.sh + working-directory: ${{ github.workspace }}/oneDNN + env: + ONEDNN_ACTION: build + + - name: Run oneDNN tests + run: ${{ github.workspace }}/oneDNN/.github/automation/aarch64/test.sh + working-directory: ${{ github.workspace }}/oneDNN/build + env: + BUILD_TOOLSET: ${{ matrix.config.toolset }} + CMAKE_BUILD_TYPE: ${{ matrix.config.build }} + CTEST_PARALLEL_LEVEL: 8 + DYLD_LIBRARY_PATH: ${{ github.workspace }}/ComputeLibrary/build + ONEDNN_THREADING: ${{ matrix.config.threading }} + 
+ #* This job adds a check named "Nightly AArch64" that represents overall + #* workflow status and can be used in branch rulesets + status: + needs: build-and-test + runs-on: ubuntu-latest + name: "Nightly AArch64" + steps: + - name: Print success + run: echo Success diff --git a/.github/workflows/openssf-scorecard.yml b/.github/workflows/openssf-scorecard.yml index 60d94920c1b..2b4efc813dc 100644 --- a/.github/workflows/openssf-scorecard.yml +++ b/.github/workflows/openssf-scorecard.yml @@ -41,12 +41,12 @@ jobs: steps: - name: "Checkout code" - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: persist-credentials: false - name: "Run analysis" - uses: ossf/scorecard-action@62b2cac7ed8198b15735ed49ab1e5cf35480ba46 # v2.4.0 + uses: ossf/scorecard-action@f49aabe0b5af0936a0987cfb85d86b75731b0186 # v2.4.1 with: results_file: results.sarif results_format: sarif diff --git a/.github/workflows/performance-aarch64.yml b/.github/workflows/performance-aarch64.yml new file mode 100644 index 00000000000..b142240db7d --- /dev/null +++ b/.github/workflows/performance-aarch64.yml @@ -0,0 +1,171 @@ +# ******************************************************************************* +# Copyright 2024-2025 Arm Limited and affiliates. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ******************************************************************************* + +name: "Performance AArch64" + +on: + workflow_call: + +#* Stop stale workflows +concurrency: + group: ${{ github.workflow }}-${{ github.ref }}-performance + cancel-in-progress: true + +# Declare default permissions as read only. 
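+# The benchmarking jobs below only read sources and restore caches, so read-only is enough.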
+permissions: read-all + +jobs: + build-acl-cache: + uses: ./.github/workflows/aarch64-acl.yml + + build-and-test-performance: + needs: build-acl-cache + strategy: + matrix: + config: [ + { name: c7g, label: ah-ubuntu_22_04-c7g_m-100, threading: OMP, toolset: gcc, build: Release, testset: NIGHTLY } + ] + + name: ${{ matrix.config.name }}, ${{ matrix.config.toolset }}, ${{ matrix.config.threading }}, ${{ matrix.config.build }} + runs-on: ${{ matrix.config.label }} + steps: + + - name: Checkout oneDNN + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + path: oneDNN + + # Note: This will create a github actions cache + - name: Get latest CMake and Ninja + uses: lukka/get-cmake@56d043d188c3612951d8755da8f4b709ec951ad6 # v3.31.6 + with: + cmakeVersion: 3.31.0 + ninjaVersion: 1.12.0 + + - if: ${{ matrix.config.threading == 'OMP' }} + name: Install openmp + run: | + sudo apt install -y libomp-dev + + - name: Read version file + id: get-versions + run: | + content=`cat ${{ github.workspace }}/oneDNN/.github/automation/aarch64/ci.json` + content="${content//[$'\t\r\n$ ']}" + echo "output=$content" >> $GITHUB_OUTPUT + + - name: Install gcc + run: | + sudo add-apt-repository ppa:ubuntu-toolchain-r/test -y + sudo apt update -y + sudo apt install -y g++-${{ fromJson(steps.get-versions.outputs.output).dependencies.gcc }} + + - name: setup python + uses: actions/setup-python@v4 + with: + python-version: '3.10' + + - name: Install scipy + if: ${{ matrix.config.build == 'Release' }} + run: pip install scipy statistics + + - name: Clone ACL + run: ${{ github.workspace }}/oneDNN/.github/automation/aarch64/build_acl.sh + env: + ACL_ACTION: clone + ACL_ROOT_DIR: ${{ github.workspace }}/ComputeLibrary + ACL_VERSION: ${{ fromJson(steps.get-versions.outputs.output).dependencies.acl }} + + - name: Get ACL commit hash for cache key + id: get_acl_commit_hash + run: (cd ${{ github.workspace }}/ComputeLibrary && echo "ACLCommitHash=$(git rev-parse --short HEAD)") >> $GITHUB_OUTPUT + + - name: Get system name + id: get_system_name + run: (echo "SystemName=$(uname)") >> $GITHUB_OUTPUT + + - name: Restore cached ACL + id: cache-acl-restore + uses: actions/cache/restore@v4 + with: + key: ${{ steps.get_system_name.outputs.SystemName }}-acl-${{ matrix.config.toolset }}-${{ matrix.config.build }}-${{ steps.get_acl_commit_hash.outputs.ACLCommitHash }} + path: ${{ github.workspace }}/ComputeLibrary/build + + - name: Configure oneDNN + run: ${{ github.workspace }}/oneDNN/.github/automation/aarch64/build.sh + working-directory: ${{ github.workspace }}/oneDNN + env: + ACL_ROOT_DIR: ${{ github.workspace }}/ComputeLibrary + BUILD_TOOLSET: ${{ matrix.config.toolset }} + CMAKE_BUILD_TYPE: ${{ matrix.config.build }} + CMAKE_GENERATOR: Ninja + GCC_VERSION: ${{ fromJson(steps.get-versions.outputs.output).dependencies.gcc }} + ONEDNN_ACTION: configure + ONEDNN_TEST_SET: ${{ matrix.config.testset }} + ONEDNN_THREADING: ${{ matrix.config.threading }} + + - name: Build oneDNN + run: ${{ github.workspace }}/oneDNN/.github/automation/aarch64/build.sh + working-directory: ${{ github.workspace }}/oneDNN + env: + ONEDNN_ACTION: build + + - name: Checkout oneDNN base + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + ref: ${{ fromJson(steps.get-versions.outputs.output).dependencies.onednn-base }} + path: oneDNN_base + + - name: Configure oneDNN base + run: ${{ github.workspace }}/oneDNN/.github/automation/aarch64/build.sh + working-directory: ${{ github.workspace }}/oneDNN_base + 
env: + ACL_ROOT_DIR: ${{ github.workspace }}/ComputeLibrary + BUILD_TOOLSET: ${{ matrix.config.toolset }} + CMAKE_BUILD_TYPE: ${{ matrix.config.build }} + CMAKE_GENERATOR: Ninja + GCC_VERSION: ${{ fromJson(steps.get-versions.outputs.output).dependencies.gcc }} + ONEDNN_ACTION: configure + ONEDNN_TEST_SET: ${{ matrix.config.testset }} + ONEDNN_THREADING: ${{ matrix.config.threading }} + + - name: Build oneDNN base + run: ${{ github.workspace }}/oneDNN/.github/automation/aarch64/build.sh + working-directory: ${{ github.workspace }}/oneDNN_base + env: + ONEDNN_ACTION: build + + - name: Run performance tests + shell: bash + run: | + OMP_NUM_THREADS=16 bash ${{ github.workspace }}/oneDNN/.github/automation/performance/bench_nightly_performance.sh ${{ github.workspace }}/oneDNN_base/build/tests/benchdnn/benchdnn ${{ github.workspace }}/oneDNN/build/tests/benchdnn/benchdnn base.txt new.txt + env: + DYLD_LIBRARY_PATH: ${{ github.workspace }}/ComputeLibrary/build + + - name: Compare 16 threads performance test results + run: | + python ${{ github.workspace }}/oneDNN/.github/automation/performance/benchdnn_comparison.py base.txt new.txt + + #* This job adds a check named "Nightly Performance AArch64" that represents overall + #* workflow status and can be used in branch rulesets + status: + needs: build-and-test-performance + runs-on: ubuntu-latest + name: "Nightly Performance AArch64" + steps: + - name: Print success + run: echo Success diff --git a/.github/workflows/pr-linter.yml b/.github/workflows/pr-linter.yml new file mode 100644 index 00000000000..4ba07ddaa15 --- /dev/null +++ b/.github/workflows/pr-linter.yml @@ -0,0 +1,98 @@ +# ******************************************************************************* +# Copyright 2024 Arm Limited and affiliates. +# Copyright 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ******************************************************************************* + +name: "PR Linters" + +on: + pull_request: + types: [opened, edited, synchronize, reopened] + +# Declare default permissions as read only. 
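+# Linter jobs only inspect the PR contents; none of them needs write access.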
+permissions: read-all + +# Kill stale checks +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: ${{ github.ref != 'refs/heads/main' }} + +jobs: + pr-commits: + name: Commit messages + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + fetch-depth: 0 + - name: Check commit messages + run: python3 ./.github/automation/commit-msg-check.py "${{ github.event.pull_request.head.sha }}" "${{ github.event.pull_request.base.sha }}" + + pr-clang-format: + name: Clang-Format + runs-on: ubuntu-22.04 + steps: + - name: Checkout + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + fetch-depth: 0 + - name: Install clang-format + run: sudo apt update && sudo apt install -y "clang-format-11" + - name: Check code formatting + run: .github/automation/clang-format.sh + + pr-format-tags: + name: Format tags consistency + runs-on: ubuntu-latest + steps: + - name: Checkout oneDNN + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + path: source + + - name: Install clang + run: | + sudo apt-get update + sudo apt-get install -y clang libomp-dev + + - name: Install castxml package + run: | + python -m venv venv + source venv/bin/activate + python -m pip install --no-cache-dir --disable-pip-version-check castxml + + - name: Configure oneDNN + run: .github/automation/x64/build_linters.sh + working-directory: ${{ github.workspace }}/source + env: + ONEDNN_ACTION: configure + + - name: Check format-tags + run: | + venv/bin/castxml --castxml-cc-gnu-c clang --castxml-output=1 -I${{ github.workspace }}/source/include -I${{ github.workspace }}/build/include ${{ github.workspace }}/source/include/oneapi/dnnl/dnnl_types.h -o ${{ github.workspace }}/types.xml + python ${{ github.workspace }}/source/scripts/generate_dnnl_debug.py ${{ github.workspace }}/types.xml + python ${{ github.workspace }}/source/scripts/generate_format_tags.py + cd ${{ github.workspace }}/source/ + git diff | grep . && exit 1 || true + + pr-status: + name: Formatting + runs-on: ubuntu-latest + needs: [ pr-commits, pr-clang-format, pr-format-tags ] + steps: + - name: Print success + run: echo "Success" \ No newline at end of file diff --git a/.gitignore b/.gitignore index db170326a5d..381a4f20f45 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,6 @@ #=============================================================================== # Copyright 2019-2021 Intel Corporation -# Copyright 2024 Arm Limited and affiliates. +# Copyright 2024-2025 Arm Limited and affiliates. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,7 +15,7 @@ # limitations under the License. #=============================================================================== -build +/build* external .vs .vscode diff --git a/CITATION.cff b/CITATION.cff index 1598115c9a1..84eb0055376 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -8,8 +8,8 @@ message: >- type: software authors: - name: oneDNN Contributors -repository-code: 'https://github.com/oneapi-src/oneDNN' -url: 'https://oneapi-src.github.io/oneDNN' +repository-code: 'https://github.com/uxlfoundation/oneDNN' +url: 'https://uxlfoundation.github.io/oneDNN' abstract: >- oneAPI Deep Neural Network Library (oneDNN) is an open-source cross-platform performance library of basic building blocks for deep learning applications. 
@@ -18,4 +18,4 @@ abstract: >- oneDNN has experimental support for the following architectures: NVIDIA GPU, AMD GPU, OpenPOWER Power ISA (PPC64), IBMz (s390x), and RISC-V. license: Apache-2.0 -version: v3.6 +version: v3.8 diff --git a/CMakeLists.txt b/CMakeLists.txt index af2522a0721..a106cea3122 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,5 +1,5 @@ #=============================================================================== -# Copyright 2016-2019 Intel Corporation +# Copyright 2016-2024 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,54 +14,26 @@ # limitations under the License. #=============================================================================== -cmake_minimum_required(VERSION 2.8.12) +cmake_minimum_required(VERSION 3.13) -if(POLICY CMP0022) - cmake_policy(SET CMP0022 NEW) -endif() - -# Foo::Bar always refers to an IMPORTED target -if(POLICY CMP0028) - cmake_policy(SET CMP0028 NEW) -endif() - -if(POLICY CMP0054) - cmake_policy(SET CMP0054 NEW) -endif() - -# Enable RPATH on MacOS/OSX -if(POLICY CMP0042) - cmake_policy(SET CMP0042 NEW) -endif() - -# Do not export symbols from executables -if(POLICY CMP0065) - cmake_policy(SET CMP0065 NEW) -endif() - -# Pass linker flags to try_compile -if(POLICY CMP0056) - cmake_policy(SET CMP0056 NEW) -endif() - -# Always link with full path -if(POLICY CMP0060) - cmake_policy(SET CMP0060 NEW) -endif() +# CMake minimum required version enables all policies introduced in minimum +# version and earlier versions. Policies introduced in future versions +# are handled individually in the section below. -# Pass compiler flags to try_compile -if(POLICY CMP0066) - cmake_policy(SET CMP0066 NEW) +# CMake 3.14: Install rules from add_subdirectory() calls are interleaved +# with those in caller. +if(POLICY CMP0082) + cmake_policy(SET CMP0082 NEW) endif() -# Use _ROOT env. variable as a prefix -if(POLICY CMP0074) - cmake_policy(SET CMP0074 NEW) +# CMake 3.27: The FindPythonInterp and FindPythonLibs modules are removed. +if(POLICY CMP0148) + cmake_policy(SET CMP0148 NEW) endif() -# Install rules order -if(POLICY CMP0082) - cmake_policy(SET CMP0082 NEW) +# CMake 3.27: The FindCUDA module is removed. +if(POLICY CMP0146) + cmake_policy(SET CMP0146 OLD) endif() if("${CMAKE_BUILD_TYPE}" STREQUAL "") @@ -69,26 +41,31 @@ if("${CMAKE_BUILD_TYPE}" STREQUAL "") set(CMAKE_BUILD_TYPE "Release" CACHE STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel RelWithAssert RelWithMDd...") endif() +if (CMAKE_GENERATOR MATCHES "^Visual Studio") + message(STATUS +"oneDNN build configuration is based on the CMAKE_BUILD_TYPE value, but + the CMake generator '${CMAKE_GENERATOR}' does not respect it and requires + using the --config option to choose the build type. 
Changing the build type + using the --config option requires rerunning CMake from scratch with a + matching CMAKE_BUILD_TYPE value.") +endif() set(PROJECT_NAME "oneDNN") set(PROJECT_FULL_NAME "oneAPI Deep Neural Network Library (oneDNN)") -set(PROJECT_VERSION "3.6.0") +set(PROJECT_VERSION "3.8.0") -if (CMAKE_VERSION VERSION_LESS 3.0) - project(${PROJECT_NAME} C CXX) -else() - cmake_policy(SET CMP0048 NEW) - project(${PROJECT_NAME} VERSION "${PROJECT_VERSION}" LANGUAGES C CXX) -endif() +project(${PROJECT_NAME} VERSION "${PROJECT_VERSION}" LANGUAGES C CXX) if (NOT CMAKE_SIZEOF_VOID_P EQUAL 8) - message(FATAL_ERROR "oneDNN supports 64 bit platforms only") + message(WARNING "oneDNN officially supports 64 bit platforms only") endif() # Set the target architecture. if(NOT DNNL_TARGET_ARCH) if(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64.*|AARCH64.*|arm64.*|ARM64.*)") set(DNNL_TARGET_ARCH "AARCH64") + elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm.*|ARM.*)") + set(DNNL_TARGET_ARCH "ARM") elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(ppc64.*|PPC64.*|powerpc64.*)") set(DNNL_TARGET_ARCH "PPC64") elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(s390x.*|S390X.*)") @@ -133,30 +110,22 @@ include("cmake/host_compiler.cmake") include("cmake/configuring_primitive_list.cmake") if(UNIX OR MINGW) - if(CMAKE_VERSION VERSION_LESS "3.1.0") - # No CMAKE__STANDARD, so add directly to CMAKE__FLAGS - # (prepended so the user can override) - set(CMAKE_C_FLAGS "-std=c99 ${CMAKE_C_FLAGS}") - # Let SYCL to choose the C++ standard it needs. - if(NOT DNNL_WITH_SYCL) - set(CMAKE_CXX_FLAGS "-std=c++11 ${CMAKE_CXX_FLAGS}") - endif() - else() - # CMAKE__STANDARD support, so set it to our defaults, unless - # overridden by the user - if(NOT DEFINED CMAKE_C_STANDARD) - set(CMAKE_C_STANDARD 99) - endif() - if(NOT DEFINED CMAKE_CXX_STANDARD AND NOT DNNL_WITH_SYCL) - set(CMAKE_CXX_STANDARD 11) - endif() - - # Disable -std=gnuXX and -std=gnu++XX - set(CMAKE_C_EXTENSIONS OFF) - set(CMAKE_CXX_EXTENSIONS OFF) + # CMAKE__STANDARD support, so set it to our defaults, unless + # overridden by the user + if(NOT DEFINED CMAKE_C_STANDARD) + set(CMAKE_C_STANDARD 99) + endif() + if(NOT DEFINED CMAKE_CXX_STANDARD AND NOT DNNL_WITH_SYCL) + set(CMAKE_CXX_STANDARD 11) endif() -endif() + # Disable -std=gnuXX and -std=gnu++XX + set(CMAKE_C_EXTENSIONS OFF) + set(CMAKE_CXX_EXTENSIONS OFF) +endif() +if (ANDROID) + set(CMAKE_CXX_STANDARD 20) +endif() # Handle cases when OpenMP runtime is requested but not found: override CPU # runtime to be sequential if(DNNL_CPU_RUNTIME STREQUAL "OMP" AND @@ -179,7 +148,7 @@ configure_file( "${PROJECT_BINARY_DIR}/README" ) -if(DNNL_INSTALL_MODE MATCHES "^(BUNDLE|BUNDLE_V2)$" AND NOT DEFINED CMAKE_INSTALL_LIBDIR) +if(DNNL_INSTALL_MODE STREQUAL "BUNDLE" AND NOT DEFINED CMAKE_INSTALL_LIBDIR) # define CMAKE_INSTALL_LIBDIR as "lib" in the case of bundle set(CMAKE_INSTALL_LIBDIR "lib") endif() @@ -192,10 +161,6 @@ add_subdirectory(examples) add_subdirectory(tests) if(DNNL_INSTALL_MODE STREQUAL "BUNDLE") - install(FILES LICENSE DESTINATION ${CMAKE_INSTALL_PREFIX}) - install(FILES THIRD-PARTY-PROGRAMS DESTINATION ${CMAKE_INSTALL_PREFIX}) - install(FILES ${PROJECT_BINARY_DIR}/README DESTINATION ${CMAKE_INSTALL_PREFIX}) -elseif(DNNL_INSTALL_MODE STREQUAL "BUNDLE_V2") install(FILES LICENSE DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/doc/${LIB_PACKAGE_NAME}) install(FILES THIRD-PARTY-PROGRAMS DESTINATION ${CMAKE_INSTALL_DATAROOTDIR}/doc/${LIB_PACKAGE_NAME}) install(FILES ${PROJECT_BINARY_DIR}/README DESTINATION 
${CMAKE_INSTALL_DATAROOTDIR}/doc/${LIB_PACKAGE_NAME}) diff --git a/CODING_STANDARDS.md b/CODING_STANDARDS.md index bfe16cae060..4698cebdd3b 100644 --- a/CODING_STANDARDS.md +++ b/CODING_STANDARDS.md @@ -25,7 +25,7 @@ oneDNN uses [clang-tidy](https://clang.llvm.org/extra/clang-tidy/) in order to diagnose and fix common style violations and easy-to-fix issues in the code base. For instructions on how to use `clang-tidy`, please refer to the [clang-tidy -RFC](https://github.com/oneapi-src/oneDNN/blob/rfcs/rfcs/20200813-clang-tidy/README.md). +RFC](https://github.com/uxlfoundation/oneDNN/blob/rfcs/rfcs/20200813-clang-tidy/README.md). The list of clang-tidy checks the oneDNN code base follows is available in the `.clang-tidy` file found in the oneDNN top-level directory. diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index ea8c718f80e..100d8429486 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -7,8 +7,8 @@ requests! To get started, see the GitHub You can: - Submit your changes directly with a - [pull request](https://github.com/oneapi-src/oneDNN/pulls) -- Log a bug or feedback with an [issue](https://github.com/oneapi-src/oneDNN/issues) + [pull request](https://github.com/uxlfoundation/oneDNN/pulls) +- Log a bug or feedback with an [issue](https://github.com/uxlfoundation/oneDNN/issues) **See also:** [Contributor Covenant](CODE_OF_CONDUCT.md) code of conduct. @@ -54,7 +54,7 @@ For Comments (RFC) process, which consists of opening, discussing, and accepting (promoting) RFC pull requests. More information about the process can be found in the dedicated -[`rfcs`](https://github.com/oneapi-src/oneDNN/tree/rfcs) branch. +[`rfcs`](https://github.com/uxlfoundation/oneDNN/tree/rfcs) branch. ## Code contribution guidelines @@ -146,7 +146,7 @@ Use the following command to run tests selected by a build configuration: ``` To modify the coverage, use the -[`ONEDNN_TEST_SET`](https://oneapi-src.github.io/oneDNN/dev_guide_build_options.html#onednn-test-set) +[`ONEDNN_TEST_SET`](https://uxlfoundation.github.io/oneDNN/dev_guide_build_options.html#onednn-test-set) build option. More details on how to run benchdnn can be found in diff --git a/MAINTAINERS.md b/MAINTAINERS.md index 212a815977f..c5bc3a05772 100644 --- a/MAINTAINERS.md +++ b/MAINTAINERS.md @@ -35,7 +35,7 @@ Privileges: ## Code Owner A Code Owner has responsibility for a specific project component or a functional -area. Code Owners are collectively responsible, with other Code Owners, +area. Code Owners are collectively responsible, with other Code Owners, for developing and maintaining their component or functional areas, including reviewing all changes to their their areas of responsibility and indicating whether those changes are ready to merge. They have a track record of @@ -72,7 +72,7 @@ including name, Github username, and affiliation. 2. At least two specific component Maintainers approve the PR. ## Maintainer -Maintainers are the most established contributors who are responsible for the +Maintainers are the most established contributors who are responsible for the project technical direction and participate in making decisions about the strategy and priorities of the project. @@ -100,7 +100,7 @@ Privileges: * Can recommend Code Owners to become Maintainers. Process of becoming a maintainer: -1. A Maintainer may nominate a current Reviewer to become a new Maintainer by +1. A Maintainer may nominate a current Reviewer to become a new Maintainer by opening a PR against MAINTAINERS.md file. 2. 
A majority of the current Maintainers must then approve the PR. @@ -108,7 +108,7 @@ opening a PR against MAINTAINERS.md file. ## Core (API, Architecture, Tests) -Team: @oneapi-src/onednn-arch +Team: @uxlfoundation/onednn-arch | Name | Github ID | Affiliation | Role | | ----------------- | --------------------- | ----------------- | ---------- | @@ -117,10 +117,11 @@ Team: @oneapi-src/onednn-arch | Mourad Gouicem | @mgouicem | Intel Corporation | Maintainer | | Vadim Pirogov | @vpirogov | Intel Corporation | Maintainer | | Ankit Manerikar | @avmanerikar | Intel Corporation | Code Owner | +| Stefan Palicki | @spalicki | Intel Corporation | Code Owner | ## Graph API -Team: @oneapi-src/onednn-graph +Team: @uxlfoundation/onednn-graph | Name | Github ID | Affiliation | Role | | ------------------ | --------------------- | ----------------- | ---------- | @@ -130,39 +131,39 @@ Team: @oneapi-src/onednn-graph | Shaojie Cui | @ShanSimu | Intel Corporation | Code Owner | | Yonghao Gu | @gyhintel | Intel Corporation | Code Owner | | Rong Zhang | @rongzha1 | Intel Corporation | Code Owner | -| Zhailong Mu | @muzhailong | Intel Corporation | Code Owner | | Xiang Guo | @xiang1guo | Intel Corporation | Code Owner | -| Jiaming Song | @litchilitchy | Intel Corporation | Code Owner | | Yixin Bao | @ElaineBao | Intel Corporation | Code Owner | ## CPU Engine ### x64 -Team: @oneapi-src/onednn-cpu-x64 +Team: @uxlfoundation/onednn-cpu-x64 | Name | Github ID | Affiliation | Role | | ------------------ | --------------------- | ----------------- | ---------- | | Andrey Kalinin | @ankalinin | Intel Corporation | Maintainer | -| Srinivas Putta | @nivas-x86 | Intel Corporation | Maintainer | | Tatyana Primak | @tprimak | Intel Corporation | Maintainer | +| Alexey Makarevich | @amakarev | Intel Corporation | Code Owner | | David Eberius | @davideberius | Intel Corporation | Code Owner | -| John Karasev | @karashjoh000 | Intel Corporation | Code Owner | | Stefan Palicki | @spalicki | Intel Corporation | Code Owner | | Tomasz Czeszun | @tczeszun | Intel Corporation | Code Owner | -| Xuxin Zen | @xuxinzen | Intel Corporation | Code Owner | +| Xuxin Zeng | @xuxinzen | Intel Corporation | Code Owner | ### AArch64 -Team: @oneapi-src/onednn-cpu-aarch64 +Team: @uxlfoundation/onednn-cpu-aarch64 | Name | Github ID | Affiliation | Role | | ------------------ | --------------------- | ----------------- | ---------- | +| Hamza Butt | @theComputeKid | Arm Ltd | Maintainer | | Crefeda Rodrigues | @cfrod | Arm Ltd | Code Owner | | David Svantesson | @davsva01 | Arm Ltd | Code Owner | -| Johnatan Deakin | @jondea | Arm Ltd | Code Owner | -| Hamza Butt | @theComputeKid | Arm Ltd | Code Owner | +| Jonathan Deakin | @jondea | Arm Ltd | Code Owner | +| Radu Salavat | @Radu2k | Arm Ltd | Code Owner | +| Siddhartha Menon | @Sqvid | Arm Ltd | Code Owner | | Sunita Nadampalli | @snadampal | Amazon.com, Inc. | Code Owner | +| Ryo Suzuki | @Ryo-not-rio | Arm Ltd | Code Owner | ### OpenPOWER (PPC64) @@ -184,7 +185,7 @@ Vacant. Maintained by Core team. 
### Intel -Team: @oneapi-src/onednn-gpu-intel +Team: @uxlfoundation/onednn-gpu-intel | Name | Github ID | Affiliation | Role | | ------------------ | --------------------- | ----------------- | ---------- | @@ -192,9 +193,11 @@ Team: @oneapi-src/onednn-gpu-intel | Konstantin Arturov | @karturov | Intel Corporation | Maintainer | | Peter Caday | @petercad | Intel Corporation | Maintainer | | Andy Kassen | @atkassen | Intel Corporation | Code Owner | +| Daniel Youssif | @dyoussif | Intel Corporation | Code Owner | | Haleema Sadia | @h-sadia | Intel Corporation | Code Owner | | Andrey Guskov | @hidefromkgb | Intel Corporation | Code Owner | | Gallagher Pryor | @pv-pterab-s | Intel Corporation | Code Owner | +| Kealan Barbieri | @kealan-barbieri | Intel Corporation | Code Owner | | Roy Oursler | @rjoursler | Intel Corporation | Code Owner | | Simon Ewing | @Simonsays095 | Intel Corporation | Code Owner | | Sergey Kazakov | @skazakov1 | Intel Corporation | Code Owner | @@ -204,42 +207,44 @@ Team: @oneapi-src/onednn-gpu-intel ### NVIDIA, AMD, and generic GPU Teams: -* @oneapi-src/onednn-gpu-nvidia -* @oneapi-src/onednn-gpu-amd -* @oneapi-src/onednn-gpu-generic +* @uxlfoundation/onednn-gpu-nvidia +* @uxlfoundation/onednn-gpu-amd +* @uxlfoundation/onednn-gpu-generic | Name | Github ID | Affiliation | Role | | ------------------ | --------------------- | ----------------- | ---------- | -| Dylan Angus | @dylan-angus-codeplay | Codeplay Software | Code Owner | -| John Osorio | @kala85 | Codeplay Software | Code Owner | +| Anton Mitkov | @ShanoToni | Codeplay Software | Code Owner | +| Atharva Dubey | @AD2605 | Codeplay Software | Code Owner | | Mehdi Goli | @mehdi-goli | Codeplay Software | Code Owner | -| Anton Mitkov | @ShaoToni | Codeplay Software | Code Owner | +| Nicolò Scipione | @s-Nick | Codeplay Software | Code Owner | | Svetlozar Georgiev | @sgeor255 | Codeplay Software | Code Owner | +| Romain Biessy | @Rbiessy | Codeplay Software | Code Owner | ## Support functions ### Documentation -Team: @oneapi-src/onednn-doc +Team: @uxlfoundation/onednn-doc | Name | Github ID | Affiliation | Role | | ------------------ | --------------------- | ----------------- | ---------- | | Vadim Pirogov | @vpirogov | Intel Corporation | Maintainer | -| Deb Taylor | @deb-intel | Intel Corporation | Code Owner | +| Ranu Kundu | @ranukund | Intel Corporation | Code Owner | +| Tao Lv | @TaoLv | Intel Corporation | Code Owner | ### DevOps -Team: @oneapi-src/onednn-devops +Team: @uxlfoundation/onednn-devops | Name | Github ID | Affiliation | Role | | ------------------ | --------------------- | ----------------- | ---------- | | Sergey Razumovskiy | @srazumov | Intel Corporation | Maintainer | | Vadim Pirogov | @vpirogov | Intel Corporation | Maintainer | +| Hamza Butt | @theComputeKid | Arm Ltd | Code Owner | ### Release management | Name | Github ID | Affiliation | Role | | ------------------ | --------------------- | ----------------- | ---------- | -| Harry Mao | @harrymao2022 | Intel Corporation | Maintainer | | Tatyana Primak | @tprimak | Intel Corporation | Maintainer | | Vadim Pirogov | @vpirogov | Intel Corporation | Maintainer | diff --git a/README.binary.in b/README.binary.in index 76d7f7e39a6..1d8ae9f9e03 100644 --- a/README.binary.in +++ b/README.binary.in @@ -13,17 +13,17 @@ developers interested in improving application performance on CPUs and GPUs. This package contains oneDNN v@PROJECT_VERSION@ (@DNNL_VERSION_HASH@). 
You can find information about the latest version and release notes -at the oneDNN Github (https://github.com/oneapi-src/oneDNN/releases). +at the oneDNN Github (https://github.com/uxlfoundation/oneDNN/releases). Documentation ------------- * Developer guide -(https://oneapi-src.github.io/oneDNN/v@DNNL_VERSION_MAJOR@.@DNNL_VERSION_MINOR@) +(https://uxlfoundation.github.io/oneDNN/v@DNNL_VERSION_MAJOR@.@DNNL_VERSION_MINOR@) explains the programming model, supported functionality, and implementation details, and includes annotated examples. * API reference -(https://oneapi-src.github.io/oneDNN/v@DNNL_VERSION_MAJOR@.@DNNL_VERSION_MINOR@/modules.html) +(https://uxlfoundation.github.io/oneDNN/v@DNNL_VERSION_MAJOR@.@DNNL_VERSION_MINOR@/modules.html) provides a comprehensive reference of the library API. System Requirements @@ -48,7 +48,7 @@ just-in-time (JIT) code generation to deploy the code optimized for the latest supported ISA. Future ISAs may have initial support in the library disabled by default and require the use of run-time controls to enable them. See CPU dispatcher control -(https://oneapi-src.github.io/oneDNN/dev_guide_cpu_dispatcher_control.html) +(https://uxlfoundation.github.io/oneDNN/dev_guide_cpu_dispatcher_control.html) for more details. The library is optimized for the following GPUs: @@ -65,7 +65,7 @@ Support ------- Submit questions, feature requests, and bug reports on the -GitHub issues page (https://github.com/oneapi-src/oneDNN/issues). +GitHub issues page (https://github.com/uxlfoundation/oneDNN/issues). License ------- @@ -102,7 +102,7 @@ govern your use of the third party programs as set forth in the # Security -Security Policy (https://github.com/oneapi-src/oneDNN/blob/main/SECURITY.md) +Security Policy (https://github.com/uxlfoundation/oneDNN/blob/main/SECURITY.md) outlines our guidelines and procedures for ensuring the highest level of Security and trust for our users who consume oneDNN. diff --git a/README.md b/README.md index 6b5c384a069..861de627f76 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ oneAPI Deep Neural Network Library (oneDNN) =========================================== [![OpenSSF Best Practices](https://www.bestpractices.dev/projects/8762/badge)](https://www.bestpractices.dev/projects/8762) -[![OpenSSF Scorecard](https://api.securityscorecards.dev/projects/github.com/oneapi-src/oneDNN/badge)](https://securityscorecards.dev/viewer/?uri=github.com/oneapi-src/oneDNN) +[![OpenSSF Scorecard](https://api.securityscorecards.dev/projects/github.com/uxlfoundation/oneDNN/badge)](https://securityscorecards.dev/viewer/?uri=github.com/uxlfoundation/oneDNN) oneAPI Deep Neural Network Library (oneDNN) is an open-source cross-platform performance library of basic building blocks for deep learning applications. @@ -18,18 +18,33 @@ AMD\* GPU, OpenPOWER\* Power ISA (PPC64), IBMz\* (s390x), and RISC-V. oneDNN is intended for deep learning applications and framework developers interested in improving application performance on CPUs and GPUs. -Deep learning practitioners should use one of the -[applications enabled with oneDNN](#applications-enabled-with-onednn). 
+ +Deep learning practitioners should use one of the applications enabled with oneDNN: + +* [Apache SINGA](https://singa.apache.org) +* [DeepLearning4J\*](https://deeplearning4j.konduit.ai) +* [Flashlight\*](https://github.com/flashlight/flashlight) +* [MATLAB\* Deep Learning Toolbox](https://www.mathworks.com/help/deeplearning) +* [ONNX Runtime](https://onnxruntime.ai) +* [OpenVINO(TM) toolkit](https://github.com/openvinotoolkit/openvino) +* [PaddlePaddle\*](http://www.paddlepaddle.org) +* [PyTorch\*](https://pytorch.org). Intel GPU support and additional +optimizations are available with [Intel® Extension for PyTorch*]. +* [TensorFlow\*](https://www.tensorflow.org). Intel GPU support and additional +optimizations are available with [Intel® Extension for TensorFlow*]. + +[Intel® Extension for PyTorch*]: https://github.com/intel/intel-extension-for-pytorch +[Intel® Extension for TensorFlow*]: https://github.com/intel/intel-extension-for-tensorflow [UXL Foundation]: http://www.uxlfoundation.org -[oneAPI specification]: https://spec.oneapi.io +[oneAPI specification]: https://oneapi-spec.uxlfoundation.org/specifications/oneapi/latest/elements/onednn/source/ # Table of Contents - [Documentation](#documentation) -- [Installation](#installation) - [System Requirements](#system-requirements) -- [Applications Enabled with oneDNN](#applications-enabled-with-onednn) +- [Installation](#installation) +- [Validated Configurations](#validated-configurations) - [Governance](#governance) - [Support](#support) - [Contributing](#contributing) @@ -39,32 +54,18 @@ Deep learning practitioners should use one of the # Documentation -* [Developer Guide] explains the programming model, supported functionality, - and implementation details, and includes annotated examples. -* [API Reference] provides a comprehensive reference of the library API. - -[Developer Guide]: https://oneapi-src.github.io/oneDNN -[API Reference]: https://oneapi-src.github.io/oneDNN/group_dnnl_api.html - -# Installation +* [oneDNN Developer Guide and Reference] explains the programming + model, supported functionality, implementation details, and includes + annotated examples. +* [API Reference] provides a comprehensive reference of the library + API. +* [Release Notes] explains the new features, performance + optimizations, and improvements implemented in each version of + oneDNN. -Binary distribution of this software is available in: -* [Anaconda] -* [Intel oneAPI] - -The packages do not include library dependencies and these need to be resolved -in the application at build time. See the [System Requirements] section below -and the [Build Options] section in the [Developer Guide] for more details on -CPU and GPU runtimes. - -If the configuration you need is not available, you can -[build the library from source][Build from Source].
- -[Anaconda]: https://anaconda.org/conda-forge/onednn -[Intel oneAPI]: https://www.intel.com/content/www/us/en/developer/tools/oneapi/onednn.html -[System Requirements]: #system-requirements -[Build Options]: https://oneapi-src.github.io/oneDNN/dev_guide_build_options.html -[Build from Source]: https://oneapi-src.github.io/oneDNN/dev_guide_build.html +[oneDNN Developer Guide and Reference]: https://uxlfoundation.github.io/oneDNN +[API Reference]: https://uxlfoundation.github.io/oneDNN/group_dnnl_api.html +[Release Notes]: https://github.com/uxlfoundation/oneDNN/releases # System Requirements @@ -119,15 +120,15 @@ The library is optimized for the following GPUs: (formerly Meteor Lake, Arrow Lake and Lunar Lake) * future Intel Arc graphics (code name Battlemage) -[CPU dispatcher control]: https://oneapi-src.github.io/oneDNN/dev_guide_cpu_dispatcher_control.html -[Linking Guide]: https://oneapi-src.github.io/oneDNN/dev_guide_link.html +[CPU dispatcher control]: https://uxlfoundation.github.io/oneDNN/dev_guide_cpu_dispatcher_control.html +[Linking Guide]: https://uxlfoundation.github.io/oneDNN/dev_guide_link.html ## Requirements for Building from Source oneDNN supports systems meeting the following requirements: * Operating system with Intel 64 / Arm 64 / Power / IBMz architecture support * C++ compiler with C++11 standard support -* [CMake] 2.8.12 or later +* [CMake] 3.13 or later The following tools are required to build oneDNN documentation: * [Doxygen] 1.8.5 or later @@ -173,7 +174,7 @@ On a CPU based on Arm AArch64 architecture, oneDNN CPU engine can be built with machine learning applications and provides AArch64 optimized implementations of core functions. This functionality currently requires that ACL is downloaded and built separately. See [Build from Source] section of the Developer Guide for -details. oneDNN only supports Compute Library versions 24.08.1 or later. +details. oneDNN only supports Compute Library versions 24.11.1 or later. [Arm Compute Library (ACL)]: https://github.com/arm-software/ComputeLibrary @@ -239,12 +240,12 @@ is enabled: [timeout detection and recovery]: https://learn.microsoft.com/en-us/windows-hardware/drivers/display/timeout-detection-and-recovery [TdrDelay]: https://learn.microsoft.com/en-us/windows-hardware/drivers/display/tdr-registry-keys#tdrdelay -### Runtime Dependencies +## Runtime Dependencies When oneDNN is built from source, the library runtime dependencies and specific versions are defined by the build environment. 
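As a hedged illustration of that point, the sketch below is a hypothetical CMake initial-cache file (used as `cmake -C runtimes.cmake ..`) pinning the two runtime options that the dependency tables in the next section map to concrete shared libraries. The option names `DNNL_CPU_RUNTIME` and `DNNL_GPU_RUNTIME` are real build options that appear in those tables; the cache-file mechanism and the chosen values are illustrative only.

```cmake
# runtimes.cmake -- hypothetical initial-cache file.
# Selecting OMP and OCL here is what makes an OpenMP runtime and the
# OpenCL loader become runtime dependencies of the built library, per
# the tables that follow.
set(DNNL_CPU_RUNTIME "OMP" CACHE STRING "CPU threading runtime")
set(DNNL_GPU_RUNTIME "OCL" CACHE STRING "GPU compute runtime")
```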
-#### Linux +### Linux Common dependencies: * GNU C Library (`libc.so`) @@ -265,7 +266,7 @@ Runtime-specific dependencies: | `DNNL_GPU_RUNTIME=OCL` | any | OpenCL loader (`libOpenCL.so`) | `DNNL_GPU_RUNTIME=SYCL` | Intel oneAPI DPC++ Compiler | Intel oneAPI DPC++ Compiler runtime (`libsycl.so`), OpenCL loader (`libOpenCL.so`), oneAPI Level Zero loader (`libze_loader.so`) -#### Windows +### Windows Common dependencies: * Microsoft Visual C++ Redistributable (`msvcrt.dll`) @@ -281,7 +282,7 @@ Runtime-specific dependencies: | `DNNL_GPU_RUNTIME=OCL` | any | OpenCL loader (`OpenCL.dll`) | `DNNL_GPU_RUNTIME=SYCL` | Intel oneAPI DPC++ Compiler | Intel oneAPI DPC++ Compiler runtime (`sycl.dll`), OpenCL loader (`OpenCL.dll`), oneAPI Level Zero loader (`ze_loader.dll`) -#### macOS +### macOS Common dependencies: * System C/C++ runtime (`libc++.dylib`, `libSystem.dylib`) @@ -293,11 +294,32 @@ Runtime-specific dependencies: | `DNNL_CPU_RUNTIME=OMP` | Intel C/C++ Compiler | Intel OpenMP runtime (`libiomp5.dylib`) | `DNNL_CPU_RUNTIME=TBB` | any | TBB (`libtbb.dylib`) -### Validated Configurations +# Installation -CPU engine was validated on RedHat\* Enterprise Linux 8 with -* GNU Compiler Collection 5.4, 6.1, 7.2, 8.1, 9.1, 11.1, 11.3 -* Clang\* 7.1, 8.0, 9.0, 14.0.6 +You can download and install the oneDNN library using one of the following options: + +- Binary Distribution: You can download pre-built binary packages from + the following sources: + - [conda-forge]: If the configuration you need is not available on + the conda-forge channel, you can build the library using the + Source Distribution. + - Intel oneAPI: + - [Intel® oneAPI Base Toolkit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit-download.htm) + - [Intel® oneDNN standalone package](https://www.intel.com/content/www/us/en/developer/tools/oneapi/onednn-download.html) + +- Source Distribution: You can build the library from source by + following the instructions on the [Build from Source] page. 
+ +[conda-forge]: https://anaconda.org/conda-forge/onednn +[System Requirements]: #system-requirements +[Build Options]: https://uxlfoundation.github.io/oneDNN/dev_guide_build_options.html +[Build from Source]: https://uxlfoundation.github.io/oneDNN/dev_guide_build.html + +# Validated Configurations + +x86-64 CPU engine was validated on RedHat\* Enterprise Linux 8 with +* GNU Compiler Collection 8.5, 9.5, 11.1, 11.3 +* Clang\* 11.0, 14.0.6 * [Intel oneAPI DPC++/C++ Compiler] 2024.0 on Windows Server\* 2019 with @@ -307,16 +329,19 @@ on Windows Server\* 2019 with on macOS 11 (Big Sur) with * Apple LLVM version 13.0 -on Ubuntu 20.04 AArch64 with -* GNU Compiler Collection 7.0, 8.0, 9.0, 10.0 -* Clang\* 9.0, 17.0 +AArch64 CPU engine was validated on Ubuntu 22.04 with +* GNU Compiler Collection 10.0, 13.0 +* Clang\* 17.0 * [Arm Compiler for Linux] 24.04 * [Arm Compute Library (ACL)] built for armv8-a arch, latest stable version available at the time of release +on macOS 14 (Sonoma) with +* Apple LLVM version 15.0 + GPU engine was validated on Ubuntu\* 22.04 with -* GNU Compiler Collection 7.2, 8.1, and 9.1 -* Clang 7.1, 8.0, 9.0 +* GNU Compiler Collection 8.5, and 9.5 +* Clang 11.0 * [Intel oneAPI DPC++/C++ Compiler] 2024.0 * [Intel Software for General Purpose GPU capabilities] latest stable version available at the time of release @@ -331,24 +356,6 @@ time of release [Intel Arc & Iris Xe Graphics Driver]: https://www.intel.com/content/www/us/en/download/785597/intel-arc-iris-xe-graphics-windows.html [Arm Compiler for Linux]: https://developer.arm.com/Tools%20and%20Software/Arm%20Compiler%20for%20Linux -# Applications Enabled with oneDNN - -* [Apache\* MXNet](https://mxnet.apache.org) -* [Apache SINGA](https://singa.apache.org) -* [DeepLearning4J\*](https://deeplearning4j.konduit.ai) -* [Flashlight\*](https://github.com/flashlight/flashlight) -* [Korali](https://github.com/cselab/korali) -* [MATLAB\* Deep Learning Toolbox](https://www.mathworks.com/help/deeplearning) -* [ONNX Runtime](https://onnxruntime.ai) -* [OpenVINO(TM) toolkit](https://github.com/openvinotoolkit/openvino) -* [PaddlePaddle\*](http://www.paddlepaddle.org) -* [PyTorch\*](https://pytorch.org). Intel GPU support and additional -optimizations are available with [Intel Extension for PyTorch]. -* [Tensorflow\*](https://www.tensorflow.org). Intel GPU support and additional -optimizations are available with [Intel Extension for Tensorflow]. - -[Intel Extension for PyTorch]: https://github.com/intel/intel-extension-for-pytorch -[Intel Extension for Tensorflow]: https://github.com/intel/intel-extension-for-tensorflow # Support @@ -358,7 +365,7 @@ Submit questions, feature requests, and bug reports on the You can also contact oneDNN developers via [UXL Foundation Slack] using [#onednn] channel. -[Github issues]: https://github.com/oneapi-src/oneDNN/issues +[Github issues]: https://github.com/uxlfoundation/oneDNN/issues [UXL Foundation Slack]: https://slack-invite.uxlfoundation.org/ [#onednn]: https://uxlfoundation.slack.com/channels/onednn @@ -384,37 +391,31 @@ schedule and work already in progress towards future milestones in Github's [Milestones] section. If you are looking for a specific task to start, consider selecting from issues that are marked with the [help wanted] label. -If you have an idea on how to improve the library: -* For changes impacting the public API or library overall, such as adding new -primitives or changes to the architecture, submit an [RFC pull request]. 
-* Ensure that the changes are consistent with the [code contribution guidelines] -and [coding standards]. -* Ensure that you can build the product and run all the examples with your -patch. -* Submit a [pull request]. - -For additional details, see [contribution guidelines](CONTRIBUTING.md). You can -also contact oneDNN developers and maintainers via [UXL Foundation Slack] using -[#onednn] channel. -This project is intended to be a safe, welcoming space for collaboration, and -contributors are expected to adhere to the +See [contribution guidelines](CONTRIBUTING.md) to start contributing +to oneDNN. You can also contact oneDNN developers and maintainers via +[UXL Foundation Slack] using [#onednn] channel. + +This project is intended to be a safe, welcoming space for +collaboration, and contributors are expected to adhere to the [Contributor Covenant](CODE_OF_CONDUCT.md) code of conduct. -[RFC pull request]: https://github.com/oneapi-src/oneDNN/tree/rfcs +[RFC pull request]: https://github.com/uxlfoundation/oneDNN/tree/rfcs [code contribution guidelines]: CONTRIBUTING.md#code-contribution-guidelines [coding standards]: CONTRIBUTING.md#coding-standards -[pull request]: https://github.com/oneapi-src/oneDNN/pulls -[Milestones]: https://github.com/oneapi-src/oneDNN/milestones -[help wanted]: https://github.com/oneapi-src/oneDNN/issues?q=is%3Aissue+is%3Aopen+label%3A%22help+wanted%22 +[pull request]: https://github.com/uxlfoundation/oneDNN/pulls +[Milestones]: https://github.com/uxlfoundation/oneDNN/milestones +[help wanted]: https://github.com/uxlfoundation/oneDNN/issues?q=is%3Aissue+is%3Aopen+label%3A%22help+wanted%22 + # License -oneDNN is licensed under [Apache License Version 2.0](LICENSE). Refer to the -"[LICENSE](LICENSE)" file for the full license text and copyright notice. +oneDNN is licensed under [Apache License Version 2.0](LICENSE). Refer +to the "[LICENSE](LICENSE)" file for the full license text and +copyright notice. -This distribution includes third party software governed by separate license -terms. +This distribution includes third-party software governed by separate +license terms. 3-clause BSD license: * [Xbyak](https://github.com/herumi/xbyak) @@ -443,17 +444,17 @@ and OpenCL Driver](https://github.com/intel/compute-runtime) Interface](https://github.com/intel/metrics-discovery) * [spdlog](https://github.com/gabime/spdlog) -This third party software, even if included with the distribution of -the Intel software, may be governed by separate license terms, including -without limitation, third party license terms, other Intel software license -terms, and open source software license terms. These separate license terms -govern your use of the third party programs as set forth in the -"[THIRD-PARTY-PROGRAMS](THIRD-PARTY-PROGRAMS)" file. +This third-party software, even if included with the distribution of +the Intel software, may be governed by separate license terms, +including without limitation, third-party license terms, other Intel +software license terms, and open source software license terms. These +separate license terms govern your use of the third-party programs as +set forth in the "[THIRD-PARTY-PROGRAMS](THIRD-PARTY-PROGRAMS)" file. # Security [Security Policy](SECURITY.md) outlines our guidelines and procedures -for ensuring the highest level of Security and trust for our users +for ensuring the highest level of security and trust for our users who consume oneDNN.
# Trademark Information diff --git a/SECURITY.md b/SECURITY.md index 0613b2e7703..279574c78fc 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -64,6 +64,6 @@ If you have any suggestions on how this Policy could be improved, please submit an issue or a pull request to this repository. Please **do not** report potential vulnerabilities or security flaws via a pull request. -[1]: https://github.com/oneapi-src/oneDNN/releases/latest -[2]: https://github.com/oneapi-src/oneDNN/security/advisories/new -[3]: https://github.com/oneapi-src/oneDNN/security/advisories +[1]: https://github.com/uxlfoundation/oneDNN/releases/latest +[2]: https://github.com/uxlfoundation/oneDNN/security/advisories/new +[3]: https://github.com/uxlfoundation/oneDNN/security/advisories diff --git a/THIRD-PARTY-PROGRAMS b/THIRD-PARTY-PROGRAMS index c377e234ed9..fa47ab926ed 100644 --- a/THIRD-PARTY-PROGRAMS +++ b/THIRD-PARTY-PROGRAMS @@ -496,7 +496,7 @@ limitations under the License. END OF TERMS AND CONDITIONS -------------------------------------------------------------------------------- -6. Boost C++ Libraries (src/common/primitive_hashing.hpp, src/graph/backend/graph_compiler/core/src/util/hash_utils.hpp) +6. Boost C++ Libraries (src/common/primitive_hashing.hpp) Copyright 2005-2014 Daniel James. Boost Software License - Version 1.0 - August 17th, 2003 @@ -610,227 +610,3 @@ THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - -10. LLVM (src/graph/backend/graph_compiler/core/src/util/array_ref.hpp) -============================================================================== -The LLVM Project is under the Apache License v2.0 with LLVM Exceptions: -============================================================================== - - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. 
- - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. 
You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. 
In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. - - ----- LLVM Exceptions to the Apache 2.0 License ---- - -As an exception, if, as a result of your compiling your source code, portions -of this Software are embedded into an Object form of such source code, you -may redistribute such embedded portions in such Object form without complying -with the conditions of Sections 4(a), 4(b) and 4(d) of the License. - -In addition, if you combine or link compiled forms of this Software with -software that is licensed under the GPLv2 ("Combined Software") and if a -court of competent jurisdiction determines that the patent provision (Section -3), the indemnity provision (Section 9) or other Section of the License -conflicts with the conditions of the GPLv2, you may retroactively and -prospectively choose to deem waived or otherwise exclude such Section(s) of -the License, but only in their entirety and only with respect to the Combined -Software. 
diff --git a/cmake/ACL.cmake b/cmake/ACL.cmake index b185f7ba340..d619e6f9226 100644 --- a/cmake/ACL.cmake +++ b/cmake/ACL.cmake @@ -21,17 +21,17 @@ endif() set(acl_cmake_included true) include("cmake/options.cmake") -if(NOT DNNL_TARGET_ARCH STREQUAL "AARCH64") +if(NOT DNNL_TARGET_ARCH MATCHES "^(AARCH64|ARM)$") return() endif() -if(NOT DNNL_AARCH64_USE_ACL) +if(NOT DNNL_USE_ACL) return() endif() find_package(ACL REQUIRED) -set(ACL_MINIMUM_VERSION "24.08.1") +set(ACL_MINIMUM_VERSION "24.11.1") if(ACL_FOUND) file(GLOB_RECURSE ACL_VERSION_FILE ${ACL_INCLUDE_DIR}/*/arm_compute_version.embed) @@ -67,7 +67,7 @@ if(ACL_FOUND) message(STATUS "Arm Compute Library: ${ACL_LIBRARIES}") message(STATUS "Arm Compute Library headers: ${ACL_INCLUDE_DIRS}") - add_definitions(-DDNNL_AARCH64_USE_ACL) + add_definitions(-DDNNL_USE_ACL) set(CMAKE_CXX_STANDARD 14) set(CMAKE_CXX_EXTENSIONS "OFF") endif() diff --git a/cmake/Doxygen.cmake b/cmake/Doxygen.cmake index 5d27d650a9e..a9409985be8 100644 --- a/cmake/Doxygen.cmake +++ b/cmake/Doxygen.cmake @@ -1,5 +1,5 @@ #=============================================================================== -# Copyright 2016-2024 Intel Corporation +# Copyright 2016-2025 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -49,7 +49,7 @@ if(DOXYGEN_FOUND) COMMENT "Generating API documentation in .xml format with Doxygen" VERBATIM) add_custom_target(doc_doxygen DEPENDS ${DOXYGEN_STAMP_FILE}) - if(NOT DNNL_INSTALL_MODE MATCHES "BUNDLE|BUNDLE_V2") + if(NOT DNNL_INSTALL_MODE STREQUAL "BUNDLE") install( DIRECTORY ${DOXYGEN_OUTPUT_DIR} DESTINATION share/doc/${LIB_PACKAGE_NAME} OPTIONAL) diff --git a/cmake/FindMIOpen.cmake b/cmake/FindMIOpen.cmake index 3928ce0dbce..727c16730af 100644 --- a/cmake/FindMIOpen.cmake +++ b/cmake/FindMIOpen.cmake @@ -34,6 +34,7 @@ list(APPEND EXTRA_SHARED_LIBS amd_comgr) # Prioritize MIOPENROOT list(APPEND miopen_root_hints + $ENV{ROCM_PATH} ${MIOPENROOT} $ENV{MIOPENROOT} "/opt/rocm" @@ -68,6 +69,10 @@ if(EXISTS "${MIOpen_INCLUDE_DIR}/miopen/version.h") "${MIOpen_MAJOR_VERSION}.${MIOpen_MINOR_VERSION}.${MIOpen_PATCH_VERSION}" ) + if(${MIOpen_MAJOR_VERSION} LESS 3) + add_definitions(-DMIOPEN_HAS_INT8X4=1) + endif() + unset(MIOpen_VERSION_CONTENT) else() message(WARNING "MIOpen version couldn't be identified.") diff --git a/cmake/FindOpenCL.cmake b/cmake/FindOpenCL.cmake index de876351714..711850959ba 100644 --- a/cmake/FindOpenCL.cmake +++ b/cmake/FindOpenCL.cmake @@ -47,18 +47,18 @@ function(_FIND_OPENCL_VERSION) set(CMAKE_REQUIRED_QUIET ${OpenCL_FIND_QUIETLY}) CMAKE_PUSH_CHECK_STATE() - foreach(VERSION "2_2" "2_1" "2_0" "1_2" "1_1" "1_0") + foreach(VERSION "3_0" "2_2" "2_1" "2_0" "1_2" "1_1" "1_0") set(CMAKE_REQUIRED_INCLUDES "${OpenCL_INCLUDE_DIR}") if(APPLE) CHECK_SYMBOL_EXISTS( CL_VERSION_${VERSION} - "${OpenCL_INCLUDE_DIR}/Headers/cl.h" + "Headers/cl.h" OPENCL_VERSION_${VERSION}) else() CHECK_SYMBOL_EXISTS( CL_VERSION_${VERSION} - "${OpenCL_INCLUDE_DIR}/CL/cl.h" + "CL/cl.h" OPENCL_VERSION_${VERSION}) endif() diff --git a/cmake/FindcublasLt.cmake b/cmake/FindcublasLt.cmake new file mode 100644 index 00000000000..bb7d4a3d5df --- /dev/null +++ b/cmake/FindcublasLt.cmake @@ -0,0 +1,48 @@ +# =============================================================================== +# Copyright 2020-2025 Intel Corporation +# Copyright 2020-2024 Codeplay Software Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not +# use this file 
except in compliance with the License. You may obtain a copy of +# the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations under +# the License. +# =============================================================================== + +find_package(CUDA 10.0 REQUIRED) +find_package(Threads REQUIRED) + +find_path( + CUBLASLT_INCLUDE_DIR "cublasLt.h" + HINTS ${CUDA_TOOLKIT_ROOT_DIR} + PATH_SUFFIXES include) + +find_library(CUDA_DRIVER_LIBRARY cuda) + +find_library( + CUBLASLT_LIBRARY cublasLt + HINTS ${CUDA_TOOLKIT_ROOT_DIR} + PATH_SUFFIXES lib lib64 bin) + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args( + cublasLt REQUIRED_VARS CUBLASLT_INCLUDE_DIR CUDA_INCLUDE_DIRS CUBLASLT_LIBRARY + CUDA_LIBRARIES CUDA_DRIVER_LIBRARY) + +if(NOT TARGET cublasLt::cublasLt) + add_library(cublasLt::cublasLt SHARED IMPORTED) + set_target_properties( + cublasLt::cublasLt + PROPERTIES IMPORTED_LOCATION ${CUBLASLT_LIBRARY} + INTERFACE_INCLUDE_DIRECTORIES + "${CUBLASLT_INCLUDE_DIR};${CUDA_INCLUDE_DIRS}" + INTERFACE_LINK_LIBRARIES + "Threads::Threads;${CUDA_DRIVER_LIBRARY};${CUDA_LIBRARIES}" + INTERFACE_COMPILE_DEFINITIONS CUDA_NO_HALF) +endif() diff --git a/cmake/FindrocBLAS.cmake b/cmake/FindrocBLAS.cmake index c36baa8b473..45743c28873 100644 --- a/cmake/FindrocBLAS.cmake +++ b/cmake/FindrocBLAS.cmake @@ -19,21 +19,23 @@ find_package(Threads REQUIRED) # Prioritize ROCBLASROOT list(APPEND rocblas_root_hints + $ENV{ROCM_PATH} ${ROCBLASROOT} $ENV{ROCBLASROOT} "/opt/rocm" - "/opt/rocm/rocblas") + "/opt/rocm/rocblas" + "/opt/rocm/lib") find_path( rocBLAS_INCLUDE_DIR "rocblas.h" HINTS ${rocblas_root_hints} - PATH_SUFFIXES include + PATH_SUFFIXES include include/rocblas ) find_library( rocBLAS_LIBRARY rocblas HINTS ${rocblas_root_hints} - PATH_SUFFIXES lib + PATH_SUFFIXES lib lib/rocblas ) if(EXISTS "${rocBLAS_INCLUDE_DIR}/internal/rocblas-version.h") diff --git a/cmake/OpenMP.cmake b/cmake/OpenMP.cmake index 9484c268506..75aeba8a467 100644 --- a/cmake/OpenMP.cmake +++ b/cmake/OpenMP.cmake @@ -1,5 +1,5 @@ #=============================================================================== -# Copyright 2017-2024 Intel Corporation +# Copyright 2017-2025 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -23,7 +23,7 @@ endif() set(OpenMP_cmake_included true) include("cmake/Threading.cmake") -if (APPLE AND CMAKE_CXX_COMPILER_ID STREQUAL "Clang") +if (APPLE AND CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang") # OSX Clang doesn't have OpenMP by default. # But we still want to build the library. set(_omp_severity "WARNING") @@ -31,19 +31,6 @@ else() set(_omp_severity "FATAL_ERROR") endif() -macro(set_openmp_values_for_old_cmake) - #newer version for findOpenMP (>= v. 
3.9) - if(CMAKE_VERSION VERSION_LESS "3.9" AND OPENMP_FOUND) - if(${CMAKE_MAJOR_VERSION} VERSION_LESS "3" AND ${CMAKE_CXX_COMPILER_ID} STREQUAL "Intel") - # Override FindOpenMP flags for Intel Compiler (otherwise deprecated) - set(OpenMP_CXX_FLAGS "-fopenmp") - set(OpenMP_C_FLAGS "-fopenmp") - endif() - set(OpenMP_C_FOUND true) - set(OpenMP_CXX_FOUND true) - endif() -endmacro() - if(DPCPP_HOST_COMPILER_KIND STREQUAL "DEFAULT") # XXX: workaround: when -fsycl is specified the compiler doesn't define # _OPENMP macro causing `find_package(OpenMP)` to fail. @@ -51,10 +38,7 @@ if(DPCPP_HOST_COMPILER_KIND STREQUAL "DEFAULT") # the -fsycl option by default so it has to be explicitly disabled. set(_omp_original_cmake_cxx_flags "${CMAKE_CXX_FLAGS}") string(REGEX REPLACE "-fsycl" "-fno-sycl" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") - find_package(OpenMP) - set_openmp_values_for_old_cmake() - set(CMAKE_CXX_FLAGS "${_omp_original_cmake_cxx_flags}") endif() @@ -68,13 +52,6 @@ if(NOT OpenMP_CXX_FOUND AND MSVC AND CMAKE_CXX_COMPILER_ID MATCHES "(Clang|Intel # The ICX driver doesn't link OpenMP library even if `/Qopenmp` # was specified. set(OpenMP_FLAGS "/Qopenmp -Xclang --dependent-lib=libiomp5md") - else() - if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS "10.0") - # version < 10 can't pass cl-style `/openmp` flag - set(OpenMP_FLAGS "-Xclang -fopenmp") - # ... and requires explicit linking against omp library - set(OpenMP_CXX_LIBRARIES "libomp.lib") - endif() endif() set(OpenMP_C_FLAGS ${OpenMP_FLAGS}) set(OpenMP_CXX_FLAGS ${OpenMP_FLAGS}) diff --git a/cmake/SDL.cmake b/cmake/SDL.cmake index cf8a7d61f51..10953c021af 100644 --- a/cmake/SDL.cmake +++ b/cmake/SDL.cmake @@ -1,5 +1,5 @@ #=============================================================================== -# Copyright 2017-2024 Intel Corporation +# Copyright 2017-2025 Intel Corporation # Copyright 2021 FUJITSU LIMITED # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -30,16 +30,11 @@ macro(sdl_unix_common_ccxx_flags var) append(${var} "-fPIC -Wformat -Wformat-security") endmacro() -macro(sdl_gnu_common_ccxx_flags var) - if(DPCPP_HOST_COMPILER_KIND STREQUAL "GNU") - # GNU compiler 7.4 or newer is required for host compiler - append(${var} "-fstack-protector-strong") - else() - if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.9) - append(${var} "-fstack-protector-all") - else() - append(${var} "-fstack-protector-strong") - endif() +macro(sdl_gnu_common_ccxx_flags var gnu_version) + append(${var} "-fstack-protector-strong") + if(NOT (${gnu_version} VERSION_LESS 8.0) + AND (DNNL_TARGET_ARCH STREQUAL "X64")) + append(${var} "-fcf-protection=full") endif() endmacro() @@ -49,44 +44,84 @@ endmacro() # only. To prevent warnings on users' side who use the library and turn # this warning on, let's use it too. 
Applicable for the library sources # and interfaces only (tests currently rely on that fact heavily) -macro(sdl_gnu_src_ccxx_flags var) +macro(sdl_unix_src_ccxx_flags var) append(${var} "-Wmissing-field-initializers") endmacro() -macro(sdl_gnu_example_ccxx_flags var) +macro(sdl_unix_example_ccxx_flags var) # At this point the flags for src and examples are the same - sdl_gnu_src_ccxx_flags(${var}) + sdl_unix_src_ccxx_flags(${var}) endmacro() -if(UNIX) - set(CMAKE_CCXX_FLAGS) +set(ONEDNN_SDL_COMPILER_FLAGS) +set(ONEDNN_SDL_LINKER_FLAGS) - sdl_unix_common_ccxx_flags(CMAKE_CCXX_FLAGS) - append(CMAKE_CXX_FLAGS_RELEASE "-D_FORTIFY_SOURCE=2") - append(CMAKE_C_FLAGS_RELEASE "-D_FORTIFY_SOURCE=2") +if(UNIX) + sdl_unix_common_ccxx_flags(ONEDNN_SDL_COMPILER_FLAGS) + sdl_unix_src_ccxx_flags(CMAKE_SRC_CCXX_FLAGS) + sdl_unix_example_ccxx_flags(CMAKE_EXAMPLE_CCXX_FLAGS) + if(UPPERCASE_CMAKE_BUILD_TYPE STREQUAL "RELEASE") + append(ONEDNN_SDL_COMPILER_FLAGS "-D_FORTIFY_SOURCE=2") + endif() if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") - sdl_gnu_common_ccxx_flags(CMAKE_CCXX_FLAGS) - sdl_gnu_src_ccxx_flags(CMAKE_SRC_CCXX_FLAGS) - sdl_gnu_example_ccxx_flags(CMAKE_EXAMPLE_CCXX_FLAGS) - elseif("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") + sdl_gnu_common_ccxx_flags(ONEDNN_SDL_COMPILER_FLAGS + CMAKE_CXX_COMPILER_VERSION) + elseif(CMAKE_CXX_COMPILER_ID MATCHES "(Apple)?[Cc]lang") get_filename_component(CXX_CMD_NAME ${CMAKE_CXX_COMPILER} NAME) # Fujitsu CXX compiler does not support "-fstack-protector-all". if(NOT CXX_CMD_NAME STREQUAL "FCC") - append(CMAKE_CCXX_FLAGS "-fstack-protector-all") + append(ONEDNN_SDL_COMPILER_FLAGS "-fstack-protector-all") endif() elseif("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Intel") - append(CMAKE_CXX_FLAGS "-fstack-protector") + append(ONEDNN_SDL_COMPILER_FLAGS "-fstack-protector") endif() - append(CMAKE_C_FLAGS "${CMAKE_CCXX_FLAGS}") - append(CMAKE_CXX_FLAGS "${CMAKE_CCXX_FLAGS}") if(APPLE) - append(CMAKE_SHARED_LINKER_FLAGS "-Wl,-bind_at_load") - append(CMAKE_EXE_LINKER_FLAGS "-Wl,-bind_at_load") + append(ONEDNN_SDL_LINKER_FLAGS "-Wl,-bind_at_load") else() + # Only applies to executables. append(CMAKE_EXE_LINKER_FLAGS "-pie") - append(CMAKE_SHARED_LINKER_FLAGS "-Wl,-z,noexecstack -Wl,-z,relro -Wl,-z,now") - append(CMAKE_EXE_LINKER_FLAGS "-Wl,-z,noexecstack -Wl,-z,relro -Wl,-z,now") + append(ONEDNN_SDL_LINKER_FLAGS "-Wl,-z,noexecstack -Wl,-z,relro -Wl,-z,now") + endif() +elseif(WIN32) + if(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") + append(ONEDNN_SDL_COMPILER_FLAGS "/GS /Gy /guard:cf /DYNAMICBASE /sdl") + append(ONEDNN_SDL_LINKER_FLAGS "/NXCOMPAT /LTCG") + elseif(CMAKE_BASE_NAME STREQUAL "icx") + append(ONEDNN_SDL_COMPILER_FLAGS "/GS /Gy /guard:cf /Wformat /Wformat-security") + append(ONEDNN_SDL_LINKER_FLAGS "/link /NXCOMPAT") + elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Clang") + append(ONEDNN_SDL_COMPILER_FLAGS "-Wformat -Wformat-security") + if(UPPERCASE_CMAKE_BUILD_TYPE STREQUAL "RELEASE") + append(ONEDNN_SDL_COMPILER_FLAGS "-D_FORTIFY_SOURCE=2") + endif() + get_filename_component(CXX_CMD_NAME ${CMAKE_CXX_COMPILER} NAME) + # Fujitsu CXX compiler does not support "-fstack-protector-all". + if(NOT CXX_CMD_NAME STREQUAL "FCC") + append(ONEDNN_SDL_COMPILER_FLAGS "-fstack-protector-all") + endif() + append(ONEDNN_SDL_LINKER_FLAGS "-Xlinker /NXCOMPAT -Xlinker /LTCG") + endif() + + if(NOT MINGW) + # For a Windows build, a malicious DLL can be injected because of the + # uncontrolled search order for load-time linked libraries defined for a + # Windows setting. 
The following cmake flags change the search order so that + # DLLs are loaded from the current working directory only if it is under a path + # in the Safe Load List. + if(CMAKE_BASE_NAME STREQUAL "icx") + # add ICX-style linker flags + append(ONEDNN_SDL_LINKER_FLAGS "/link /DEPENDENTLOADFLAG:0x2000") + elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Clang") + # add Clang-style linker flags + append(ONEDNN_SDL_LINKER_FLAGS "-Xlinker /DEPENDENTLOADFLAG:0x2000") + else() + # Default to MSVC-style definition + append(ONEDNN_SDL_LINKER_FLAGS "/DEPENDENTLOADFLAG:0x2000") + endif() endif() -elseif(MSVC AND ${CMAKE_CXX_COMPILER_ID} STREQUAL MSVC) - set(CMAKE_CCXX_FLAGS "/guard:cf") endif() + +append(CMAKE_C_FLAGS "${ONEDNN_SDL_COMPILER_FLAGS}") +append(CMAKE_CXX_FLAGS "${ONEDNN_SDL_COMPILER_FLAGS}") +append(CMAKE_SHARED_LINKER_FLAGS "${ONEDNN_SDL_LINKER_FLAGS}") +append(CMAKE_EXE_LINKER_FLAGS "${ONEDNN_SDL_LINKER_FLAGS}") diff --git a/cmake/SYCL.cmake b/cmake/SYCL.cmake index bfaf25e53a1..5ca1c1c1beb 100644 --- a/cmake/SYCL.cmake +++ b/cmake/SYCL.cmake @@ -1,5 +1,5 @@ #=============================================================================== -# Copyright 2019-2024 Intel Corporation +# Copyright 2019-2025 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -88,12 +88,13 @@ endmacro() if(DNNL_SYCL_CUDA) suppress_warnings_for_nvidia_target() find_package(cuBLAS REQUIRED) + find_package(cublasLt REQUIRED) find_package(cuDNN REQUIRED) - adjust_headers_priority("cuBLAS::cuBLAS;cuDNN::cuDNN") + adjust_headers_priority("cuBLAS::cuBLAS;cuDNN::cuDNN;cublasLt::cublasLt") add_definitions_with_host_compiler("-DCUDA_NO_HALF") - list(APPEND EXTRA_SHARED_LIBS cuBLAS::cuBLAS cuDNN::cuDNN) + list(APPEND EXTRA_SHARED_LIBS cuBLAS::cuBLAS cuDNN::cuDNN cublasLt::cublasLt) message(STATUS "DPC++ support is enabled (CUDA)") elseif(DNNL_SYCL_HIP) find_package(HIP REQUIRED) @@ -135,14 +136,7 @@ endif() # #pragma message("The Intel extensions have been moved into cl_ext.h. # Please include cl_ext.h directly.") if(NOT WIN32) - if(${CMAKE_VERSION} VERSION_LESS "3.1.0") - # Prior to CMake 3.1 the Makefile generators did not escape # correctly - # inside make variable assignments used in generated makefiles, causing - # them to be treated as comments. This is a workaround. - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-\\#pragma-messages") - else() - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-#pragma-messages") - endif() + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-#pragma-messages") endif() add_definitions_with_host_compiler("-DCL_TARGET_OPENCL_VERSION=300") diff --git a/cmake/Sphinx.cmake b/cmake/Sphinx.cmake index 99b7de2868f..ed1e17a41f0 100644 --- a/cmake/Sphinx.cmake +++ b/cmake/Sphinx.cmake @@ -1,5 +1,5 @@ #=============================================================================== -# Copyright 2021 Intel Corporation +# Copyright 2021-2024 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -22,9 +22,9 @@ if(Sphinx_cmake_included) endif() set(Sphinx_cmake_included true) -find_package(PythonInterp 2.7) +find_package(Python 3.7 COMPONENTS Interpreter) find_package(Sphinx) -if (PYTHONINTERP_FOUND AND SPHINX_FOUND) +if (Python_FOUND AND SPHINX_FOUND) set(SPHINX_GENERATOR "html" CACHE STRING "specifies generator for Sphinx") set(SPHINX_OUTPUT_DIR @@ -52,7 +52,7 @@ if (PYTHONINTERP_FOUND AND SPHINX_FOUND) COMMAND ${CMAKE_COMMAND} -E copy_directory ${CMAKE_CURRENT_SOURCE_DIR}/doc/sphinx/_static ${SPHINX_SOURCE_DIR}/_static - COMMAND ${PYTHON_EXECUTABLE} + COMMAND ${Python_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/cleanup.py ${SPHINX_SOURCE_DIR} COMMAND ${SPHINX_EXECUTABLE} -b ${SPHINX_GENERATOR} -D release=v${PROJECT_VERSION} -j auto rst ${SPHINX_OUTPUT_DIR} @@ -60,4 +60,4 @@ if (PYTHONINTERP_FOUND AND SPHINX_FOUND) WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/reference COMMENT "Generating API documentation with Sphinx" VERBATIM) add_custom_target(doc_sphinx DEPENDS ${SPHINX_STAMP_FILE} doc_doxyrest) -endif(PYTHONINTERP_FOUND AND SPHINX_FOUND) +endif(Python_FOUND AND SPHINX_FOUND) diff --git a/cmake/TBB.cmake b/cmake/TBB.cmake index 7c82c428b41..ef6669da8a9 100644 --- a/cmake/TBB.cmake +++ b/cmake/TBB.cmake @@ -26,7 +26,10 @@ include("cmake/Threading.cmake") macro(handle_tbb_target) if(TBB_FOUND) set_property(TARGET TBB::tbb PROPERTY "MAP_IMPORTED_CONFIG_RELWITHMDD" "DEBUG") - include_directories_with_host_compiler(${_tbb_include_dirs}) + foreach(inc_dir ${_tbb_include_dirs}) + include_directories(BEFORE SYSTEM ${inc_dir}) + append_host_compiler_options(CMAKE_CXX_FLAGS "-I${inc_dir}") + endforeach() list(APPEND EXTRA_SHARED_LIBS TBB::tbb) # Print TBB location @@ -59,7 +62,7 @@ macro(handle_tbb_target) add_definitions(-DTBB_PREVIEW_TASK_ARENA_CONSTRAINTS_EXTENSION=1) endmacro() -if(NOT DNNL_CPU_THREADING_RUNTIME STREQUAL "TBB") +if(NOT "${DNNL_CPU_THREADING_RUNTIME}" MATCHES "^(TBB|TBB_AUTO)$") return() endif() diff --git a/cmake/Threading.cmake b/cmake/Threading.cmake index 5ad3d903b07..cdbef190c7f 100644 --- a/cmake/Threading.cmake +++ b/cmake/Threading.cmake @@ -39,22 +39,12 @@ list(APPEND EXTRA_SHARED_LIBS "${CMAKE_THREAD_LIBS_INIT}") # A macro to avoid code duplication macro(find_package_tbb) - # Try to find TBB using a TBB-provided CMake config file. - find_package(TBB QUIET COMPONENTS tbb) - # If the previous `find_package` call failed then try to - # use a TBB CMake config file that is maintained by oneDNN. - # The reason the previous call may fail is that TBB package is - # very old and doesn't provide a CMake config file. - if(NOT TBB_FOUND) - message(STATUS "TBB-provided CMake config either failed or was not found. 
Trying to use a custom one.") - set(_cmake_proj_dir "${PROJECT_SOURCE_DIR}/cmake") - if(WIN32) - find_package(TBB ${ARGN} COMPONENTS tbb HINTS ${_cmake_proj_dir}/win) - elseif(APPLE) - find_package(TBB ${ARGN} COMPONENTS tbb HINTS ${_cmake_proj_dir}/mac) - elseif(UNIX) - find_package(TBB ${ARGN} COMPONENTS tbb HINTS ${_cmake_proj_dir}/lnx) - endif() + if(WIN32) + find_package(TBB ${ARGN} COMPONENTS tbb) + elseif(APPLE) + find_package(TBB ${ARGN} COMPONENTS tbb) + elseif(UNIX) + find_package(TBB ${ARGN} COMPONENTS tbb) endif() if(TBB_FOUND) diff --git a/cmake/config.cmake.in b/cmake/config.cmake.in index 24a35b5d4bf..0cdd6f754e3 100644 --- a/cmake/config.cmake.in +++ b/cmake/config.cmake.in @@ -21,6 +21,8 @@ set(DNNL_GPU_RUNTIME "@DNNL_GPU_RUNTIME@") set(DNNL_BLAS_VENDOR "@DNNL_BLAS_VENDOR@") +set(DNNL_GPU_VENDOR "@DNNL_GPU_VENDOR@") + if(DNNL_CPU_THREADING_RUNTIME STREQUAL "TBB") # Try to find TBB using a TBB-provided CMake config file. find_package(TBB QUIET COMPONENTS tbb) @@ -62,6 +64,14 @@ check_required_components("@LIB_PACKAGE_NAME@") if(DNNL_CPU_RUNTIME STREQUAL "SYCL" OR DNNL_CPU_RUNTIME STREQUAL "DPCPP" OR DNNL_GPU_RUNTIME STREQUAL "SYCL" OR DNNL_GPU_RUNTIME STREQUAL "DPCPP") + if(DNNL_GPU_VENDOR STREQUAL "NVIDIA") + set(DNNL_ORIGINAL_CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH}) + list(INSERT CMAKE_MODULE_PATH 0 ${PACKAGE_PREFIX_DIR}/@LIB_CONFIG_INSTALL_DIR@) + find_package(cuDNN REQUIRED) + find_package(cuBLAS REQUIRED) + find_package(cublasLt REQUIRED) + set(CMAKE_MODULE_PATH ${DNNL_ORIGINAL_CMAKE_MODULE_PATH}) + endif() set(DNNL_COMPILE_FLAGS "-fsycl") @HANDLE_BUNDLE_DEBUG_SYCL_CONFIGURATION@ endif() diff --git a/cmake/configuring_primitive_list.cmake b/cmake/configuring_primitive_list.cmake index 3524f171070..55fc83b33e2 100644 --- a/cmake/configuring_primitive_list.cmake +++ b/cmake/configuring_primitive_list.cmake @@ -1,5 +1,5 @@ #=============================================================================== -# Copyright 2021-2024 Intel Corporation +# Copyright 2021-2025 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
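To show the consumer-side effect of the config-file change above (which re-finds the cuDNN, cuBLAS, and cublasLt modules when `DNNL_GPU_VENDOR` is NVIDIA), here is a minimal downstream `CMakeLists.txt` sketch. The project name and source file are hypothetical; `DNNL::dnnl` is the target the installed package exports.

```cmake
# Hypothetical consumer project; a plain find_package suffices because
# the shipped dnnl config file resolves any GPU-vendor dependencies
# (e.g. the CUDA find modules above) on the consumer's behalf.
cmake_minimum_required(VERSION 3.13)
project(myapp CXX)

find_package(dnnl CONFIG REQUIRED)

add_executable(myapp main.cpp)
target_link_libraries(myapp PRIVATE DNNL::dnnl)
```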
@@ -32,7 +32,7 @@ else() foreach(impl ${DNNL_ENABLE_PRIMITIVE}) string(TOUPPER ${impl} uimpl) if(NOT "${uimpl}" MATCHES - "^(BATCH_NORMALIZATION|BINARY|CONCAT|CONVOLUTION|DECONVOLUTION|ELTWISE|INNER_PRODUCT|LAYER_NORMALIZATION|LRN|MATMUL|POOLING|PRELU|REDUCTION|REORDER|RESAMPLING|RNN|SDPA|SHUFFLE|SOFTMAX|SUM)$") + "^(BATCH_NORMALIZATION|BINARY|CONCAT|CONVOLUTION|DECONVOLUTION|ELTWISE|GROUP_NORMALIZATION|INNER_PRODUCT|LAYER_NORMALIZATION|LRN|MATMUL|POOLING|PRELU|REDUCTION|REORDER|RESAMPLING|RNN|SDPA|SHUFFLE|SOFTMAX|SUM)$") message(FATAL_ERROR "Unsupported primitive: ${uimpl}") endif() set(BUILD_${uimpl} TRUE) @@ -58,7 +58,7 @@ if (DNNL_ENABLE_PRIMITIVE_GPU_ISA STREQUAL "ALL") else() foreach(isa ${DNNL_ENABLE_PRIMITIVE_GPU_ISA}) string(TOUPPER ${isa} uisa) - if(NOT "${uisa}" MATCHES "^(GEN9|GEN11|XELP|XEHP|XEHPG|XEHPC|XE2)$") + if(NOT "${uisa}" MATCHES "^(GEN9|GEN11|XELP|XEHP|XEHPG|XEHPC|XE2|XE3)$") message(FATAL_ERROR "Unsupported primitive GPU ISA: ${uisa}") endif() set(BUILD_${uisa} TRUE) diff --git a/cmake/coverage.cmake b/cmake/coverage.cmake index ef0d06eed83..ce1799ed120 100644 --- a/cmake/coverage.cmake +++ b/cmake/coverage.cmake @@ -1,5 +1,5 @@ #=============================================================================== -# Copyright 2019-2020 Intel Corporation +# Copyright 2019-2025 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -36,11 +36,7 @@ if("${DNNL_CODE_COVERAGE}" STREQUAL "GCOV") message(FATAL_ERROR "GCOV not found in path") endif() - if("${CMAKE_CXX_COMPILER_ID}" MATCHES "(Apple)?[Cc]lang") - if("${CMAKE_CXX_COMPILER_VERSION}" VERSION_LESS 3) - message(FATAL_ERROR "Clang version must be 3.0.0 or greater! Aborting...") - endif() - elseif(NOT "${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") + if(NOT CMAKE_CXX_COMPILER_ID MATCHES "(Apple)?[Cc]lang|GNU") message(FATAL_ERROR "Unsupported compiler: ${CMAKE_CXX_COMPILER_ID}") endif() @@ -49,7 +45,7 @@ if("${DNNL_CODE_COVERAGE}" STREQUAL "GCOV") if(NOT CMAKE_BUILD_TYPE MATCHES "[Dd]ebug") message(WARNING "Code coverage results with an optimised (non-Debug) build may be misleading") - endif() + endif() if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") link_libraries(gcov) diff --git a/cmake/dnnl_compat.cmake b/cmake/dnnl_compat.cmake index 17e3d0192e1..7e2fc043a1a 100644 --- a/cmake/dnnl_compat.cmake +++ b/cmake/dnnl_compat.cmake @@ -35,6 +35,8 @@ endmacro() set(COMPAT_CACHE_BOOL_VARS "EXPERIMENTAL" "EXPERIMENTAL_SPARSE" + "EXPERIMENTAL_UKERNEL" + "EXPERIMENTAL_LOGGING" "VERBOSE" "ENABLE_CONCURRENT_EXEC" "ENABLE_PRIMITIVE_CACHE" diff --git a/cmake/gen_gpu_kernel.cmake b/cmake/gen_gpu_kernel.cmake index 672c88ef877..dfc5feb1d9b 100644 --- a/cmake/gen_gpu_kernel.cmake +++ b/cmake/gen_gpu_kernel.cmake @@ -1,5 +1,5 @@ #=============================================================================== -# Copyright 2019-2024 Intel Corporation +# Copyright 2019-2025 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
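A hedged sketch of what the two new `COMPAT_CACHE_BOOL_VARS` entries in the dnnl_compat.cmake hunk above buy: assuming the compat macro defined earlier in that file (not shown in this hunk) forwards `ONEDNN_*`-prefixed cache entries to their `DNNL_*` equivalents, either spelling now enables the same experimental feature.

```cmake
# Illustrative only: the compat spelling on the configure line,
#   cmake -DONEDNN_EXPERIMENTAL_UKERNEL=ON ..
# is assumed to be forwarded by the compat layer to the canonical form:
set(DNNL_EXPERIMENTAL_UKERNEL ON CACHE BOOL "build the ukernel API")
```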
@@ -22,18 +22,22 @@ file(READ ${CL_FILE} cl_file_lines) -# Remove C++ style comments -string(REGEX REPLACE "//[^\n]*\n" "\n" cl_file_lines "${cl_file_lines}") -# Remove repeated whitespaces -string(REGEX REPLACE " +" " " cl_file_lines "${cl_file_lines}") -# Remove leading whitespaces -string(REGEX REPLACE "\n " "\n" cl_file_lines "${cl_file_lines}") -# Remove empty lines -string(REGEX REPLACE "\n+" "\n" cl_file_lines "${cl_file_lines}") +string(LENGTH "${cl_file_lines}" len) +if(MINIFY OR len GREATER 65535) + # Remove C++ style comments + string(REGEX REPLACE "//[^\n]*\n" "\n" cl_file_lines "${cl_file_lines}") + # Remove repeated whitespaces + string(REGEX REPLACE " +" " " cl_file_lines "${cl_file_lines}") + # Remove leading whitespaces + string(REGEX REPLACE "\n " "\n" cl_file_lines "${cl_file_lines}") + # Remove empty lines + string(REGEX REPLACE "\n+" "\n" cl_file_lines "${cl_file_lines}") +endif() string(LENGTH "${cl_file_lines}" len) if(len GREATER 65535) - message(WARNING "Windows requires string literals to fit in 65535 bytes. Please split ${CL_FILE}.") + message(FATAL_ERROR + "Windows requires string literals to fit in 65535 bytes. Please split ${CL_FILE}.") endif() get_filename_component(cl_file_name ${CL_FILE} NAME_WE) diff --git a/cmake/gen_gpu_kernel_list.cmake b/cmake/gen_gpu_kernel_list.cmake index 02f8cacb9bb..f64f90d6259 100644 --- a/cmake/gen_gpu_kernel_list.cmake +++ b/cmake/gen_gpu_kernel_list.cmake @@ -1,5 +1,5 @@ #=============================================================================== -# Copyright 2020-2021 Intel Corporation +# Copyright 2020-2025 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -46,6 +46,11 @@ endfunction() function(gen_gpu_kernel_list ker_list_templ ker_list_src ker_sources headers) set(_sources "${SOURCES}") + set(MINIFY "ON") + if(DNNL_DEV_MODE OR CMAKE_BUILD_TYPE STREQUAL "Debug") + set(MINIFY "OFF") + endif() + set(KER_LIST_EXTERN) set(KER_LIST_ENTRIES) set(KER_HEADERS_EXTERN) @@ -62,6 +67,7 @@ function(gen_gpu_kernel_list ker_list_templ ker_list_src ker_sources headers) COMMAND ${CMAKE_COMMAND} -DCL_FILE="${header_path}" -DGEN_FILE="${gen_file}" + -DMINIFY="${MINIFY}" -P ${PROJECT_SOURCE_DIR}/cmake/gen_gpu_kernel.cmake DEPENDS ${header_path} ) @@ -81,6 +87,7 @@ function(gen_gpu_kernel_list ker_list_templ ker_list_src ker_sources headers) COMMAND ${CMAKE_COMMAND} -DCL_FILE="${ker_path}" -DGEN_FILE="${gen_file}" + -DMINIFY="${MINIFY}" -P ${PROJECT_SOURCE_DIR}/cmake/gen_gpu_kernel.cmake DEPENDS ${ker_path} ) diff --git a/cmake/host_compiler.cmake b/cmake/host_compiler.cmake index 22b5ed60bd3..7d64edbbe8a 100644 --- a/cmake/host_compiler.cmake +++ b/cmake/host_compiler.cmake @@ -1,5 +1,5 @@ #=============================================================================== -# Copyright 2021-2024 Intel Corporation +# Copyright 2021-2025 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -35,6 +35,8 @@ if(DPCPP_HOST_COMPILER_KIND MATCHES "^(GNU|CLANG)$") platform_unix_and_mingw_common_cxx_flags(DPCPP_HOST_COMPILER_OPTS) sdl_unix_common_ccxx_flags(DPCPP_HOST_COMPILER_OPTS) + sdl_unix_src_ccxx_flags(DPCPP_SRC_COMPILER_OPTS) + sdl_unix_example_ccxx_flags(DPCPP_EXAMPLE_COMPILER_OPTS) # SYCL uses C++17 features in headers hence C++17 support should be enabled # for host compiler. 
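Since the gen_gpu_kernel.cmake hunk above runs in CMake script mode, the minification path can be exercised standalone. The sketch below mirrors the `add_custom_command` invocation added in gen_gpu_kernel_list.cmake; the kernel file names are placeholders.

```cmake
# Hypothetical one-off invocation of the kernel embedding script.
# MINIFY=ON forces comment/whitespace stripping even when the source
# is already under the 65535-byte MSVC string-literal limit.
execute_process(
    COMMAND ${CMAKE_COMMAND}
        -DCL_FILE="kernel.cl"
        -DGEN_FILE="kernel.cl.h"
        -DMINIFY="ON"
        -P cmake/gen_gpu_kernel.cmake)
```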
@@ -78,9 +80,7 @@ if(DPCPP_HOST_COMPILER_KIND MATCHES "^(GNU|CLANG)$") if(DPCPP_HOST_COMPILER_KIND STREQUAL "GNU") platform_gnu_nowarn_ccxx_flags(DPCPP_CXX_NOWARN_FLAGS ${DPCPP_HOST_COMPILER_MAJOR_VER}.${DPCPP_HOST_COMPILER_MINOR_VER}) - sdl_gnu_common_ccxx_flags(DPCPP_HOST_COMPILER_OPTS) - sdl_gnu_src_ccxx_flags(DPCPP_SRC_CXX_FLAGS) - sdl_gnu_example_ccxx_flags(DPCPP_EXAMPLE_CXX_FLAGS) + sdl_gnu_common_ccxx_flags(DPCPP_HOST_COMPILER_OPTS DPCPP_HOST_COMPILER_VER) # SYCL headers contain some comments that trigger warning with GNU compiler append(DPCPP_HOST_COMPILER_OPTS "-Wno-comment") @@ -100,6 +100,11 @@ if(DPCPP_HOST_COMPILER_KIND MATCHES "^(GNU|CLANG)$") # Affects both, GNU and CLANG kinds. append(CMAKE_CXX_FLAGS "-Wno-unused-command-line-argument") + # Option `-fsycl-unnamed-lambda` is enabled by default, but not compatible + # with `-fsycl-host-compiler`. While icpx driver adds + # `-fno-sycl-unnamed-lambda` to avoid build issues clang++ does not do that. + append(CMAKE_CXX_FLAGS "-fno-sycl-unnamed-lambda") + append(CMAKE_CXX_FLAGS "-fsycl-host-compiler=${DPCPP_HOST_COMPILER}") append_host_compiler_options(CMAKE_CXX_FLAGS "${DPCPP_HOST_COMPILER_OPTS}") endif() diff --git a/cmake/host_compiler_id.cmake b/cmake/host_compiler_id.cmake index 2a74d703c4f..966c553023c 100644 --- a/cmake/host_compiler_id.cmake +++ b/cmake/host_compiler_id.cmake @@ -1,5 +1,5 @@ #=============================================================================== -# Copyright 2024 Intel Corporation +# Copyright 2024-2025 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -98,13 +98,13 @@ message(STATUS "Host compiler version: ${DPCPP_HOST_COMPILER_MAJOR_VER}.${DPCPP_ # Check the version of the provided host compiler. if(DPCPP_HOST_COMPILER_KIND STREQUAL "GNU") - if((DPCPP_HOST_COMPILER_MAJOR_VER LESS 7) OR (DPCPP_HOST_COMPILER_MAJOR_VER EQUAL 7 AND DPCPP_HOST_COMPILER_MINOR_VER LESS 4)) - message(FATAL_ERROR "The minimum version of ${DPCPP_HOST_COMPILER_KIND} host compiler is 7.4.") + if(DPCPP_HOST_COMPILER_MAJOR_VER LESS 8) + message(FATAL_ERROR "The minimum version of ${DPCPP_HOST_COMPILER_KIND} host compiler is 8.0.") endif() endif() if(DPCPP_HOST_COMPILER_KIND STREQUAL "CLANG") - if(DPCPP_HOST_COMPILER_MAJOR_VER LESS 8) - message(FATAL_ERROR "The minimum version of ${DPCPP_HOST_COMPILER_KIND} host compiler is 8.0.") + if(DPCPP_HOST_COMPILER_MAJOR_VER LESS 11) + message(FATAL_ERROR "The minimum version of ${DPCPP_HOST_COMPILER_KIND} host compiler is 11.0.") endif() endif() diff --git a/cmake/lnx/TBBConfig.cmake b/cmake/lnx/TBBConfig.cmake deleted file mode 100644 index bedbff68e39..00000000000 --- a/cmake/lnx/TBBConfig.cmake +++ /dev/null @@ -1,183 +0,0 @@ -#=============================================================================== -# Copyright 2017-2020 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-#=============================================================================== - -# TBB_FOUND should not be set explicitly. It is defined automatically by CMake. -# Handling of TBB_VERSION is in TBBConfigVersion.cmake. - -if (NOT TBB_FIND_COMPONENTS) - set(TBB_FIND_COMPONENTS "tbb;tbbmalloc;tbbmalloc_proxy") - foreach (_tbb_component ${TBB_FIND_COMPONENTS}) - set(TBB_FIND_REQUIRED_${_tbb_component} 1) - endforeach() -endif() - -# Add components with internal dependencies: tbbmalloc_proxy -> tbbmalloc -list(FIND TBB_FIND_COMPONENTS tbbmalloc_proxy _tbbmalloc_proxy_ix) -if (NOT _tbbmalloc_proxy_ix EQUAL -1) - list(FIND TBB_FIND_COMPONENTS tbbmalloc _tbbmalloc_ix) - if (_tbbmalloc_ix EQUAL -1) - list(APPEND TBB_FIND_COMPONENTS tbbmalloc) - set(TBB_FIND_REQUIRED_tbbmalloc ${TBB_FIND_REQUIRED_tbbmalloc_proxy}) - endif() -endif() - -# oneDNN changes: use TBBROOT to locate Intel TBB -# get_filename_component(_tbb_root "${CMAKE_CURRENT_LIST_FILE}" PATH) -# get_filename_component(_tbb_root "${_tbb_root}" PATH) -if (NOT TBBROOT) - if(DEFINED ENV{TBBROOT}) - set (TBBROOT $ENV{TBBROOT}) - endif() -endif() - -set(_tbb_root ${TBBROOT}) - -set(_tbb_x32_subdir ia32) -set(_tbb_x64_subdir intel64) - -if (CMAKE_SIZEOF_VOID_P EQUAL 8) - set(_tbb_arch_subdir ${_tbb_x64_subdir}) -else() - set(_tbb_arch_subdir ${_tbb_x32_subdir}) -endif() - -if (CMAKE_CXX_COMPILER_LOADED) - set(_tbb_compiler_id ${CMAKE_CXX_COMPILER_ID}) - set(_tbb_compiler_ver ${CMAKE_CXX_COMPILER_VERSION}) -elseif (CMAKE_C_COMPILER_LOADED) - set(_tbb_compiler_id ${CMAKE_C_COMPILER_ID}) - set(_tbb_compiler_ver ${CMAKE_C_COMPILER_VERSION}) -endif() - -# For non-GCC compilers try to find version of system GCC to choose right compiler subdirectory. -if (NOT _tbb_compiler_id STREQUAL "GNU") - execute_process(COMMAND gcc --version OUTPUT_VARIABLE _tbb_gcc_ver_output ERROR_QUIET) - string(REGEX REPLACE ".*gcc.* ([0-9]+\\.[0-9]+)\\.[0-9]+.*" "\\1" _tbb_compiler_ver "${_tbb_gcc_ver_output}") - if (NOT _tbb_compiler_ver) - message(FATAL_ERROR "This Intel TBB package is intended to be used only environment with available 'gcc'") - endif() - unset(_tbb_gcc_ver_output) -endif() - -if (EXISTS "${_tbb_root}/lib/${_tbb_arch_subdir}") - set(_tbb_lib ${_tbb_root}/lib/${_tbb_arch_subdir}) - set(_tbb_inc ${_tbb_root}/include) - - file(GLOB _tbb_gcc_versions_available RELATIVE ${_tbb_lib} ${_tbb_lib}/*) - # shall we check _tbb_gcc_versions_available is not empty? 
- foreach (_tbb_gcc_version ${_tbb_gcc_versions_available}) - string(SUBSTRING ${_tbb_gcc_version} 3 -1 _tbb_gcc_version_number) - if (NOT _tbb_compiler_ver VERSION_LESS _tbb_gcc_version_number) - set(_tbb_compiler_subdir ${_tbb_gcc_version}) - endif() - endforeach() -else() - if (TBBROOT) - set(__tbb_hint_path "${TBBROOT}") - else() - set(__tbb_hint_path "/non/existing/path") - endif() - - # try to find TBB in the system - find_library(_tbb_lib NAMES tbb - HINTS "${__tbb_hint_path}" - PATH_SUFFIXES lib lib64) - find_path(_tbb_inc NAMES tbb.h - HINTS "${__tbb_hint_path}" - PATH_SUFFIXES include tbb include/tbb) - unset(__tbb_hint_path) - - if (NOT _tbb_lib OR NOT _tbb_inc) - message("FATAL_ERROR" "Cannot find TBB") - endif() - - get_filename_component(_tbb_lib "${_tbb_lib}" PATH) - get_filename_component(_tbb_inc "${_tbb_inc}" PATH) - - set(_tbb_arch_subdir "") - set(_tbb_compiler_subdir "") -endif() - -unset(_tbb_gcc_version_number) -unset(_tbb_compiler_id) -unset(_tbb_compiler_ver) - -# Now we check that all the needed component are present -get_filename_component(_tbb_lib_path "${_tbb_lib}/${_tbb_compiler_subdir}" ABSOLUTE) - -if (TBB_FOUND) - return() -endif() - -foreach (_tbb_soversion 2 12) -foreach (_tbb_component ${TBB_FIND_COMPONENTS}) - set(_tbb_release_lib - "${_tbb_lib_path}/lib${_tbb_component}.so.${_tbb_soversion}") - set(_tbb_debug_lib - "${_tbb_lib_path}/lib${_tbb_component}_debug.so.${_tbb_soversion}") - - # oneDNN change: check library existence (BUILD_MODE related only, not both) - string(TOUPPER "${CMAKE_BUILD_TYPE}" UPPERCASE_CMAKE_BUILD_TYPE) - if (UPPERCASE_CMAKE_BUILD_TYPE STREQUAL "DEBUG") - if (EXISTS "${_tbb_debug_lib}") - set(_lib_exists TRUE) - elseif (EXISTS "${_tbb_release_lib}") - message(FATAL_ERROR - "Intel TBB release library is found here: ${_tbb_release_lib}. 
" - "But the debug library - (lib${_tbb_component}_debug.so.${_tbb_soversion}) is missing.") - endif() - else() - if (EXISTS "${_tbb_release_lib}") - set(_lib_exists TRUE) - endif() - endif() - - if (_lib_exists) - if (NOT TARGET TBB::${_tbb_component}) - add_library(TBB::${_tbb_component} SHARED IMPORTED) - set_target_properties(TBB::${_tbb_component} PROPERTIES - IMPORTED_CONFIGURATIONS "RELEASE;DEBUG" - IMPORTED_LOCATION_RELEASE "${_tbb_release_lib}" - IMPORTED_LOCATION_DEBUG "${_tbb_debug_lib}" - INTERFACE_INCLUDE_DIRECTORIES "${_tbb_inc}") - - # Add internal dependencies for imported targets: TBB::tbbmalloc_proxy -> TBB::tbbmalloc - if (_tbb_component STREQUAL tbbmalloc_proxy) - set_target_properties(TBB::tbbmalloc_proxy PROPERTIES INTERFACE_LINK_LIBRARIES TBB::tbbmalloc) - endif() - - list(APPEND TBB_IMPORTED_TARGETS TBB::${_tbb_component}) - set(TBB_${_tbb_component}_FOUND 1) - endif() - break() - endif() -endforeach() -endforeach() - -if (NOT _lib_exists AND TBB_FIND_REQUIRED AND TBB_FIND_REQUIRED_${_tbb_component}) - message(FATAL_ERROR "Missed required Intel TBB component: ${_tbb_component}") -endif() - -unset(_tbb_x32_subdir) -unset(_tbb_x64_subdir) -unset(_tbb_arch_subdir) -unset(_tbb_compiler_subdir) -unset(_tbbmalloc_proxy_ix) -unset(_tbbmalloc_ix) -unset(_tbb_lib_path) -unset(_tbb_release_lib) -unset(_tbb_debug_lib) diff --git a/cmake/mac/TBBConfig.cmake b/cmake/mac/TBBConfig.cmake deleted file mode 100644 index 7bb9af865e2..00000000000 --- a/cmake/mac/TBBConfig.cmake +++ /dev/null @@ -1,127 +0,0 @@ -#=============================================================================== -# Copyright 2017-2020 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -#=============================================================================== - -# TBB_FOUND should not be set explicitly. It is defined automatically by CMake. -# Handling of TBB_VERSION is in TBBConfigVersion.cmake. - -if (NOT TBB_FIND_COMPONENTS) - set(TBB_FIND_COMPONENTS "tbb;tbbmalloc;tbbmalloc_proxy") - foreach (_tbb_component ${TBB_FIND_COMPONENTS}) - set(TBB_FIND_REQUIRED_${_tbb_component} 1) - endforeach() -endif() - -# Add components with internal dependencies: tbbmalloc_proxy -> tbbmalloc -list(FIND TBB_FIND_COMPONENTS tbbmalloc_proxy _tbbmalloc_proxy_ix) -if (NOT _tbbmalloc_proxy_ix EQUAL -1) - list(FIND TBB_FIND_COMPONENTS tbbmalloc _tbbmalloc_ix) - if (_tbbmalloc_ix EQUAL -1) - list(APPEND TBB_FIND_COMPONENTS tbbmalloc) - set(TBB_FIND_REQUIRED_tbbmalloc ${TBB_FIND_REQUIRED_tbbmalloc_proxy}) - endif() -endif() - -# oneDNN changes: use TBBROOT to locate Intel TBB -# get_filename_component(_tbb_root "${CMAKE_CURRENT_LIST_FILE}" PATH) -# get_filename_component(_tbb_root "${_tbb_root}" PATH) -if (NOT TBBROOT) - if(DEFINED ENV{TBBROOT}) - set (TBBROOT $ENV{TBBROOT}) - else() - message("FATAL_ERROR" "TBBROOT is unset") - endif() -endif() - -set(_tbb_root ${TBBROOT}) - -set(_tbb_x32_subdir .) -set(_tbb_x64_subdir .) 
- -if (CMAKE_SIZEOF_VOID_P EQUAL 8) - set(_tbb_arch_subdir ${_tbb_x64_subdir}) -else() - set(_tbb_arch_subdir ${_tbb_x32_subdir}) -endif() - -set(_tbb_compiler_subdir .) - -get_filename_component(_tbb_lib_path "${_tbb_root}/lib/${_tbb_arch_subdir}/${_tbb_compiler_subdir}" ABSOLUTE) - -if (TBB_FOUND) - return() -endif() - -foreach (_tbb_lib_version .12 "") -foreach (_tbb_component ${TBB_FIND_COMPONENTS}) - set(_tbb_release_lib "${_tbb_lib_path}/lib${_tbb_component}${_tbb_lib_version}.dylib") - set(_tbb_debug_lib "${_tbb_lib_path}/lib${_tbb_component}_debug${_tbb_lib_version}.dylib") - - # oneDNN change: check library existence (BUILD_MODE related only, not both) - string(TOUPPER "${CMAKE_BUILD_TYPE}" UPPERCASE_CMAKE_BUILD_TYPE) - if (UPPERCASE_CMAKE_BUILD_TYPE STREQUAL "DEBUG") - if (EXISTS "${_tbb_debug_lib}") - set(_lib_exists TRUE) - elseif (EXISTS "${_tbb_release_lib}") - message(FATAL_ERROR - "Intel TBB release library is found here: ${_tbb_release_lib}. " - "But the debug library - (lib${_tbb_component}_debug${_tbb_lib_version}.dylib) is missing.") - endif() - else() - if (EXISTS "${_tbb_release_lib}") - set(_lib_exists TRUE) - endif() - endif() - - if (_lib_exists) - if (NOT TARGET TBB::${_tbb_component}) - add_library(TBB::${_tbb_component} SHARED IMPORTED) - set_target_properties(TBB::${_tbb_component} PROPERTIES - IMPORTED_CONFIGURATIONS "RELEASE;DEBUG" - IMPORTED_LOCATION_RELEASE "${_tbb_release_lib}" - IMPORTED_LOCATION_DEBUG "${_tbb_debug_lib}" - INTERFACE_INCLUDE_DIRECTORIES "${_tbb_root}/include") - - # Add internal dependencies for imported targets: TBB::tbbmalloc_proxy -> TBB::tbbmalloc - if (_tbb_component STREQUAL tbbmalloc_proxy) - set_target_properties(TBB::tbbmalloc_proxy PROPERTIES INTERFACE_LINK_LIBRARIES TBB::tbbmalloc) - endif() - - list(APPEND TBB_IMPORTED_TARGETS TBB::${_tbb_component}) - set(TBB_${_tbb_component}_FOUND 1) - endif() - break() - endif() -endforeach() -endforeach() - -foreach (_tbb_component ${TBB_FIND_COMPONENTS}) - if (NOT TARGET TBB::${_tbb_component} AND TBB_FIND_REQUIRED AND TBB_FIND_REQUIRED_${_tbb_component}) - message(FATAL_ERROR "Missed required Intel TBB component: ${_tbb_component}") - endif() -endforeach() - -unset(_tbb_x32_subdir) -unset(_tbb_x64_subdir) -unset(_tbb_arch_subdir) -unset(_tbb_compiler_subdir) -unset(_tbbmalloc_proxy_ix) -unset(_tbbmalloc_ix) -unset(_tbb_lib_path) -unset(_tbb_release_lib) -unset(_tbb_debug_lib) -unset(_tbb_lib_version) -unset(_lib_exists) diff --git a/cmake/options.cmake b/cmake/options.cmake index 0bb963ae24b..b73128d759d 100644 --- a/cmake/options.cmake +++ b/cmake/options.cmake @@ -1,5 +1,5 @@ #=============================================================================== -# Copyright 2018-2024 Intel Corporation +# Copyright 2018-2025 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -81,12 +81,11 @@ set(DNNL_TEST_SET "CI" CACHE STRING semicolon separated string, e.g., DNNL_TEST_SET=CI;NO_CORR.") set(DNNL_INSTALL_MODE "DEFAULT" CACHE STRING - "specifies installation mode; supports DEFAULT, BUNDLE and BUNDLE_V2. + "specifies installation mode; supports DEFAULT and BUNDLE. - When BUNDLE or BUNDLE_V2 option is set oneDNN will be installed as a bundle - which contains examples and benchdnn. 
The difference between BUNDLE and
-    BUNDLE_V2 is in the directory layout.")
-if (NOT "${DNNL_INSTALL_MODE}" MATCHES "^(DEFAULT|BUNDLE|BUNDLE_V2)$")
+    When the BUNDLE option is set, oneDNN will be installed as a bundle
+    which contains examples and benchdnn.")
+if (NOT "${DNNL_INSTALL_MODE}" MATCHES "^(DEFAULT|BUNDLE)$")
     message(FATAL_ERROR "Unsupported install mode: ${DNNL_INSTALL_MODE}")
 endif()
 
@@ -123,9 +122,9 @@ set(DNNL_ENABLE_PRIMITIVE "ALL" CACHE STRING
     - ALL (the default). Includes all primitives to be enabled.
     - <PRIMITIVE_NAME>. Includes only the selected primitive to be enabled.
       Possible values are: BATCH_NORMALIZATION, BINARY, CONCAT, CONVOLUTION,
-      DECONVOLUTION, ELTWISE, INNER_PRODUCT, LAYER_NORMALIZATION, LRN, MATMUL,
-      POOLING, PRELU, REDUCTION, REORDER, RESAMPLING, RNN, SDPA, SHUFFLE,
-      SOFTMAX, SUM.
+      DECONVOLUTION, ELTWISE, GROUP_NORMALIZATION, INNER_PRODUCT,
+      LAYER_NORMALIZATION, LRN, MATMUL, POOLING, PRELU, REDUCTION, REORDER,
+      RESAMPLING, RNN, SDPA, SHUFFLE, SOFTMAX, SUM.
     - <PRIMITIVE_NAME>;<PRIMITIVE_NAME>;... Includes only selected primitives
       to be enabled at build time. This is treated as CMake string, thus,
       semicolon is a mandatory delimiter between names. This is the way to
      specify several
@@ -147,7 +146,7 @@ set(DNNL_ENABLE_PRIMITIVE_GPU_ISA "ALL" CACHE STRING
     implementations will always be available. Valid values:
     - ALL (the default). Includes all ISA to be enabled.
     - <ISA_NAME>;<ISA_NAME>;... Includes only selected ISA to be enabled.
-      Possible values are: GEN9, GEN11, XELP, XEHP, XEHPG, XEHPC, XE2.")
+      Possible values are: GEN9, GEN11, XELP, XEHP, XEHPG, XEHPC, XE2, XE3.")
 
 set(ONEDNN_ENABLE_GEMM_KERNELS_ISA "ALL" CACHE STRING
     "Specifies an ISA set of GeMM kernels residing in x64/gemm folder to be
@@ -224,13 +223,6 @@ option(DNNL_EXPERIMENTAL_LOGGING
     independently from DNNL_EXPERIMENTAL." OFF) # disabled by default
 
-option(ONEDNN_EXPERIMENTAL_GRAPH_COMPILER_BACKEND
-    "builds oneDNN Graph API graph-compiler backend" OFF)
-set(ONEDNN_EXPERIMENTAL_GRAPH_COMPILER_CPU_LLVM_CONFIG "AUTO" CACHE STRING
-    "graph-compiler's llvm-config path")
-set(ONEDNN_EXPERIMENTAL_GRAPH_COMPILER_CPU_JIT "builtin" CACHE STRING
-    "the optional JIT backends for graph-compiler: llvm;c;builtin")
-
 # ======================
 # Profiling capabilities
 # ======================
@@ -262,7 +254,7 @@ set(DNNL_CPU_RUNTIME "OMP" CACHE STRING
     To use Threading Building Blocks (TBB) one should also set TBBROOT (either
     environment variable or CMake option) to the library location.")
-if(NOT "${DNNL_CPU_RUNTIME}" MATCHES "^(NONE|OMP|TBB|SEQ|THREADPOOL|DPCPP|SYCL)$")
+if(NOT "${DNNL_CPU_RUNTIME}" MATCHES "^(NONE|OMP|TBB|TBB_AUTO|SEQ|THREADPOOL|DPCPP|SYCL)$")
     message(FATAL_ERROR "Unsupported CPU runtime: ${DNNL_CPU_RUNTIME}")
 endif()
 
@@ -381,6 +373,8 @@ set(DNNL_USE_CLANG_TIDY "NONE" CACHE STRING
     - NONE (default)
       Clang-tidy is disabled.
     - CHECK
+      Enables checks from .clang-tidy for source code.
+    - CHECK_ALL
       Enables checks from .clang-tidy.
     - FIX
       Enables checks from .clang-tidy and fix found issues.
@@ -419,8 +413,11 @@ set(DNNL_BLAS_VENDOR "NONE" CACHE STRING
 # AArch64 optimizations with Arm Compute Library
 # ==============================================
 
-option(DNNL_AARCH64_USE_ACL "Enables use of AArch64 optimised functions
+option(DNNL_USE_ACL "Enables use of ARM optimised functions
     from Arm Compute Library. This is only supported on AArch64 builds and
    assumes there is a functioning Compute Library build available at the
    location specified by the environment variable ACL_ROOT_DIR."
OFF) + +option(DNNL_XBYAK_NO_EXCEPTION + "Enables XBYAK_NO_EXCEPTION" ON) # enabled by default diff --git a/cmake/platform.cmake b/cmake/platform.cmake index fc8a7c13e35..aa06aaef7ca 100644 --- a/cmake/platform.cmake +++ b/cmake/platform.cmake @@ -1,5 +1,5 @@ #=============================================================================== -# Copyright 2016-2024 Intel Corporation +# Copyright 2016-2025 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -78,11 +78,12 @@ macro(platform_gnu_nowarn_ccxx_flags var gnu_version) append(${var} "-Wno-strict-overflow") # suppress false positive warnings about uninitialized variables append(${var} "-Wno-maybe-uninitialized") - # suppress false positive warnings with 10.x: GCC Bugzilla – Bug 96963 + # suppress false positive warnings with 9.x+: GCC Bugzilla – Bug 96963 # assume 0.0 is unknown version - always suppress the warning if(${gnu_version} VERSION_EQUAL 0.0 OR - (${gnu_version} VERSION_GREATER 10.0 AND ${gnu_version} VERSION_LESS 11.0)) + ${gnu_version} VERSION_GREATER 9.0) append(${var} "-Wno-stringop-overflow") + append(${var} "-Wno-array-bounds") endif() endmacro() @@ -119,12 +120,26 @@ endif() if(MSVC) set(USERCONFIG_PLATFORM "x64") append_if(DNNL_WERROR CMAKE_CCXX_FLAGS "/WX") + + # Generating frame pointers for easier performance profiling + if(DNNL_TARGET_ARCH STREQUAL "X64") + if(CMAKE_CXX_COMPILER_ID MATCHES "Clang") + append(CMAKE_CCXX_FLAGS "-fno-omit-frame-pointer -mno-omit-leaf-frame-pointer") + else() + append(CMAKE_CCXX_FLAGS "/Oy-") + endif() + endif() + if(${CMAKE_CXX_COMPILER_ID} STREQUAL MSVC) append(CMAKE_CCXX_FLAGS "/MP") # increase number of sections in obj file append(CMAKE_CCXX_FLAGS "/bigobj") # make preprocessor standard compliant append(CMAKE_CCXX_FLAGS "/Zc:preprocessor") + # Set UTF-8 as default encoding to be consistent with other compilers + append(CMAKE_CCXX_FLAGS "/utf-8") + # Enable __cplusplus macro to align behavior with other compilers + append(CMAKE_CCXX_FLAGS "/Zc:__cplusplus") # int64_t -> int (tent) append(CMAKE_CCXX_NOWARN_FLAGS "/wd4244") # workaround: macro outputs defined token in msvs header @@ -152,7 +167,7 @@ if(MSVC) # disable: icpc deprecation notice append(CMAKE_CXX_FLAGS_DEBUG "-Qdiag-disable:10441") endif() - if(CMAKE_CXX_COMPILER_ID MATCHES "Clang") + if(CMAKE_CXX_COMPILER_ID MATCHES "(Apple)?[Cc]lang") append(CMAKE_CCXX_NOEXCEPT_FLAGS "-fno-exceptions") # Clang cannot vectorize some loops with #pragma omp simd and gets # very upset. Tell it that it's okay and that we love it @@ -227,14 +242,22 @@ elseif(UNIX OR MINGW) append(CMAKE_CCXX_NOWARN_FLAGS "-Wno-recommended-option") # Older compiler versions may not support "-Wno-recommended-option". 
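One of the MSVC flags added above, `/Zc:__cplusplus`, is easy to sanity-check: without it MSVC freezes the `__cplusplus` macro at its legacy value regardless of the selected standard. A minimal probe (illustrative, not from this patch):

~~~cpp
#include <iostream>

int main() {
    // Without /Zc:__cplusplus, MSVC reports the legacy 199711L here even in
    // C++17 mode; with the flag it reports the real value (e.g. 201703L),
    // matching GCC and Clang. Any code that dispatches on __cplusplus
    // depends on this behavior being aligned across compilers.
    std::cout << "__cplusplus = " << __cplusplus << "\n";
    return 0;
}
~~~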
append(CMAKE_CCXX_FLAGS "-Wno-unknown-warning-option") + + # Align with GCC -Wall + append(CMAKE_CCXX_FLAGS "-Wsign-compare") + endif() + + # Generating frame pointers for easier performance profiling + if(DNNL_TARGET_ARCH STREQUAL "X64") + append(CMAKE_CCXX_FLAGS "-fno-omit-frame-pointer -mno-omit-leaf-frame-pointer") endif() platform_unix_and_mingw_common_ccxx_flags(CMAKE_CCXX_FLAGS) platform_unix_and_mingw_common_cxx_flags(CMAKE_CXX_FLAGS) platform_unix_and_mingw_noexcept_ccxx_flags(CMAKE_CMAKE_CCXX_NOEXCEPT_FLAGS) # compiler specific settings - if(CMAKE_CXX_COMPILER_ID MATCHES "Clang") - if(DNNL_TARGET_ARCH STREQUAL "AARCH64") + if(CMAKE_CXX_COMPILER_ID MATCHES "(Apple)?[Cc]lang") + if(DNNL_TARGET_ARCH MATCHES "^(AARCH64|ARM)$") if (NOT CMAKE_BUILD_TYPE STREQUAL "Debug") set(DEF_ARCH_OPT_FLAGS "-O3") endif() @@ -276,8 +299,11 @@ elseif(UNIX OR MINGW) if(DNNL_USE_CLANG_SANITIZER STREQUAL "MemoryWithOrigin") append(CMAKE_CCXX_SANITIZER_FLAGS "-fsanitize-memory-track-origins=2") - append(CMAKE_CCXX_SANITIZER_FLAGS - "-fno-omit-frame-pointer") + # Already enabled for x64 + if(NOT DNNL_TARGET_ARCH STREQUAL "X64") + append(CMAKE_CCXX_SANITIZER_FLAGS + "-fno-omit-frame-pointer") + endif() endif() set(DNNL_ENABLED_CLANG_SANITIZER "${DNNL_USE_CLANG_SANITIZER}") elseif(DNNL_USE_CLANG_SANITIZER STREQUAL "Undefined") @@ -302,25 +328,35 @@ elseif(UNIX OR MINGW) message(STATUS "Using Clang ${DNNL_ENABLED_CLANG_SANITIZER} " "sanitizer (experimental!)") - append(CMAKE_CCXX_SANITIZER_FLAGS "-g -fno-omit-frame-pointer") + append(CMAKE_CCXX_SANITIZER_FLAGS "-g") + # Already enabled for x64 + if(NOT DNNL_TARGET_ARCH STREQUAL "X64") + append(CMAKE_CCXX_SANITIZER_FLAGS "-fno-omit-frame-pointer") + endif() + # Blacklist to ignore false-positive cases. Each case may be # assigned to a specific sanitizer. See online doc for help. append(CMAKE_CCXX_SANITIZER_FLAGS "-fsanitize-blacklist=${PROJECT_SOURCE_DIR}/.clang-ignorelist") endif() - if (DNNL_USE_CLANG_TIDY MATCHES "(CHECK|FIX)" AND ${CMAKE_VERSION} VERSION_LESS "3.6.0") - message(FATAL_ERROR "Using clang-tidy requires CMake 3.6.0 or newer") - elseif(DNNL_USE_CLANG_TIDY MATCHES "(CHECK|FIX)") + if(DNNL_USE_CLANG_TIDY MATCHES "(CHECK|CHECK_ALL|FIX)") find_program(CLANG_TIDY NAMES clang-tidy) if(NOT CLANG_TIDY) message(FATAL_ERROR "Clang-tidy not found") else() + # FIXME: Remove --header-filter option once clang-tidy warnings + # are addressed if(DNNL_USE_CLANG_TIDY STREQUAL "CHECK") + set(CMAKE_CXX_CLANG_TIDY ${CLANG_TIDY} + --header-filter='') + message(STATUS "Using clang-tidy to run checks for source") + elseif(DNNL_USE_CLANG_TIDY STREQUAL "CHECK_ALL") set(CMAKE_CXX_CLANG_TIDY ${CLANG_TIDY}) - message(STATUS "Using clang-tidy to run checks") + message(STATUS "Using clang-tidy to run checks for source and headers") elseif(DNNL_USE_CLANG_TIDY STREQUAL "FIX") - set(CMAKE_CXX_CLANG_TIDY ${CLANG_TIDY} -fix) + set(CMAKE_CXX_CLANG_TIDY ${CLANG_TIDY} + -fix) message(STATUS "Using clang-tidy to run checks and fix found issues") endif() endif() @@ -333,13 +369,7 @@ elseif(UNIX OR MINGW) append(CMAKE_CCXX_FLAGS "-Wno-ignored-attributes") endif() - # XXX: Suppress an erroneous warning of nested lambda visibility - # exceeding that of the containing class (GCC Bugzilla - Bug 80947). 
-    if (CMAKE_CXX_COMPILER_VERSION VERSION_LESS 8 AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 6.0)
-        append(CMAKE_CCXX_FLAGS "-Wno-attributes")
-    endif()
-
-    if(DNNL_TARGET_ARCH STREQUAL "AARCH64")
+    if(DNNL_TARGET_ARCH MATCHES "^(AARCH64|ARM)$")
         if (NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
             set(DEF_ARCH_OPT_FLAGS "-O3")
         endif()
@@ -418,8 +448,7 @@ if(DNNL_ARCH_OPT_FLAGS STREQUAL "HostOpts")
     set(DNNL_ARCH_OPT_FLAGS "${DEF_ARCH_OPT_FLAGS}")
 endif()
 
-append(CMAKE_C_FLAGS "${CMAKE_CCXX_FLAGS} ${DNNL_ARCH_OPT_FLAGS}")
-append(CMAKE_CXX_FLAGS "${CMAKE_CCXX_FLAGS} ${DNNL_ARCH_OPT_FLAGS}")
+append(CMAKE_CCXX_FLAGS "${DNNL_ARCH_OPT_FLAGS}")
 
 if(APPLE)
     set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
@@ -435,8 +464,18 @@ endif()
 
 if (DNNL_TARGET_ARCH STREQUAL "RV64")
     # Check if the RVV Intrinsics can be compiled with the current toolchain and flags
     include(CheckCXXSourceCompiles)
-    check_cxx_source_compiles("#include <riscv_vector.h>
-        int main() { return 0; };"
+    check_cxx_source_compiles("#if !defined(__riscv) || !defined(__riscv_v)
+        #error \"RISC-V or vector extension(RVV) is not supported by the compiler\"
+        #endif
+
+        #if defined(__riscv_v_intrinsic) && __riscv_v_intrinsic < 12000
+        #error \"RISC-V intrinsics v0.12 or higher is required\"
+        #endif
+
+        #include <riscv_vector.h>
+
+        int main() {
+            return 0;
+        };"
        CAN_COMPILE_RVV_INTRINSICS
    )
    # set CAN_COMPILE_RVV_INTRINSICS to TRUE / FALSE instead of 1 / "" (Undefined)
@@ -454,3 +493,6 @@ if (DNNL_TARGET_ARCH STREQUAL "RV64")
     message(STATUS "Can compile RVV Intrinsics: ${CAN_COMPILE_RVV_INTRINSICS}")
     message(STATUS "DNNL_RISCV_USE_RVV_INTRINSICS: ${DNNL_RISCV_USE_RVV_INTRINSICS}")
 endif()
+
+append(CMAKE_C_FLAGS "${CMAKE_CCXX_FLAGS}")
+append(CMAKE_CXX_FLAGS "${CMAKE_CCXX_FLAGS}")
diff --git a/cmake/testing.cmake b/cmake/testing.cmake
index a002920fcbe..8d9b3f7fd16 100644
--- a/cmake/testing.cmake
+++ b/cmake/testing.cmake
@@ -1,5 +1,5 @@
 #===============================================================================
-# Copyright 2020-2024 Intel Corporation
+# Copyright 2020-2025 Intel Corporation
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -32,6 +32,7 @@ set(DNNL_TEST_SET_COVERAGE "0")
 set(DNNL_TEST_SET_COVERAGE_STR "")
 set(DNNL_TEST_SET_HAS_NO_CORR "0")
 set(DNNL_TEST_SET_HAS_ADD_BITWISE "0")
+set(DNNL_TEST_SET_HAS_GRAPH_EXE "0")
 
 function(check_consistency entry)
     if(NOT DNNL_TEST_SET_COVERAGE EQUAL 0)
@@ -57,6 +58,8 @@ foreach(entry ${DNNL_TEST_SET})
         set(DNNL_TEST_SET_HAS_NO_CORR "1")
     elseif(entry STREQUAL "ADD_BITWISE")
         set(DNNL_TEST_SET_HAS_ADD_BITWISE "1")
+    elseif(entry STREQUAL "GRAPH_EXE")
+        set(DNNL_TEST_SET_HAS_GRAPH_EXE "1")
     elseif(entry STREQUAL "CI_NO_CORR") # Left here for compatibility till v4.0
         set(DNNL_TEST_SET_COVERAGE ${DNNL_TEST_SET_CI})
         set(DNNL_TEST_SET_COVERAGE_STR "CI")
@@ -68,7 +71,7 @@ foreach(entry ${DNNL_TEST_SET})
         message(FATAL_ERROR "The DNNL_TEST_SET entry ${entry} is not recognized.
" "Supported values are:" - "NIGHTLY, CI, SMOKE, NO_CORR, ADD_BITWISE.") + "NIGHTLY, CI, SMOKE, NO_CORR, ADD_BITWISE, GRAPH_EXE.") endif() endforeach() @@ -79,3 +82,6 @@ endif() if(DNNL_TEST_SET_HAS_ADD_BITWISE EQUAL 1) message(STATUS "Enabled testing modifier: Add bitwise validation") endif() +if(DNNL_TEST_SET_HAS_GRAPH_EXE EQUAL 1) + message(STATUS "Enabled testing modifier: Use graph execution") +endif() diff --git a/cmake/utils.cmake b/cmake/utils.cmake index fdd6b2c95dc..05d55f5ccc9 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -142,35 +142,16 @@ macro(append_to_windows_path_list path_list path) endif() endmacro() -function(target_link_libraries_build target list) - # Foreach is required for compatibility with 2.8.11 ways - foreach(lib ${list}) - target_link_libraries(${target} LINK_PUBLIC - "$") - endforeach(lib) -endfunction() - +# Strip paths from libraries before populating INSTALL_INTERFACE function(target_link_libraries_install target list) - # Foreach is required for compatibility with 2.8.11 ways foreach(lib ${list}) get_filename_component(base "${lib}" NAME) - target_link_libraries(${target} LINK_PUBLIC - "$") + target_link_libraries(${target} PUBLIC "$") endforeach(lib) endfunction() function(find_libm var) - # This is to account for the linker cache in OSX11. might work - # with lower than 3.9.4, but was not able to test with anything - # between 2.8 and 3.9. See here for more details: - # https://gitlab.kitware.com/cmake/cmake/-/issues/20863 - if (APPLE AND (${CMAKE_HOST_SYSTEM_VERSION} VERSION_GREATER "20.0.0") - AND (${CMAKE_VERSION} VERSION_LESS "3.9.4")) - message(INFO "Using OSX11 and above with CMAKE older than 3.18 can cause linking issues.") - set(OSX11_AND_OLDER_CMAKE TRUE) - endif() - - if(UNIX AND (NOT (APPLE AND OSX11_AND_OLDER_CMAKE))) + if(UNIX) find_library(${var} m REQUIRED) endif() endfunction() diff --git a/cmake/win/TBBConfig.cmake b/cmake/win/TBBConfig.cmake deleted file mode 100644 index 623147f53ac..00000000000 --- a/cmake/win/TBBConfig.cmake +++ /dev/null @@ -1,164 +0,0 @@ -#=============================================================================== -# Copyright 2017-2021 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -#=============================================================================== - -# TBB_FOUND should not be set explicitly. It is defined automatically by CMake. -# Handling of TBB_VERSION is in TBBConfigVersion.cmake. 
- -if (NOT TBB_FIND_COMPONENTS) - set(TBB_FIND_COMPONENTS "tbb;tbbmalloc;tbbmalloc_proxy") - foreach (_tbb_component ${TBB_FIND_COMPONENTS}) - set(TBB_FIND_REQUIRED_${_tbb_component} 1) - endforeach() -endif() - -# Add components with internal dependencies: tbbmalloc_proxy -> tbbmalloc -list(FIND TBB_FIND_COMPONENTS tbbmalloc_proxy _tbbmalloc_proxy_ix) -if (NOT _tbbmalloc_proxy_ix EQUAL -1) - list(FIND TBB_FIND_COMPONENTS tbbmalloc _tbbmalloc_ix) - if (_tbbmalloc_ix EQUAL -1) - list(APPEND TBB_FIND_COMPONENTS tbbmalloc) - set(TBB_FIND_REQUIRED_tbbmalloc ${TBB_FIND_REQUIRED_tbbmalloc_proxy}) - endif() -endif() - -# oneDNN changes: use TBBROOT to locate Intel TBB -# get_filename_component(_tbb_root "${CMAKE_CURRENT_LIST_FILE}" PATH) -# get_filename_component(_tbb_root "${_tbb_root}" PATH) -if (NOT TBBROOT) - if(DEFINED ENV{TBBROOT}) - set (TBBROOT $ENV{TBBROOT}) - else() - message("FATAL_ERROR" "TBBROOT is unset") - endif() -endif() - -set(_tbb_root ${TBBROOT}) - -set(_tbb_x32_subdir ia32) -set(_tbb_x64_subdir intel64) - -if (CMAKE_SIZEOF_VOID_P EQUAL 8) - set(_tbb_arch_subdir ${_tbb_x64_subdir}) -else() - set(_tbb_arch_subdir ${_tbb_x32_subdir}) -endif() - -# Workaround: 3.19.0 and 3.19.1 versions don't define MSVC_VERSION. -# The workaround is to assume that vc14 is used. -set(_tbb_detect_msvc_version FALSE) -if (NOT ${CMAKE_VERSION} VERSION_EQUAL "3.19.0" AND NOT ${CMAKE_VERSION} VERSION_EQUAL "3.19.1") - set(_tbb_detect_msvc_version TRUE) -endif() - -# Detect the most relevant MSVC subdirectory -set(_tbb_msvc_1700_subdir vc11) -set(_tbb_msvc_1800_subdir vc12) -set(_tbb_msvc_1900_subdir vc14) - -# oneDNN changes: if the project is not with MSVC, try to use MSVC 1900 -set(_tbb_msvc_ver 1900) - -if (_tbb_detect_msvc_version) - if (MSVC) - set(_tbb_msvc_ver ${MSVC_VERSION}) - endif() - if (MSVC_VERSION VERSION_LESS 1700) - message(FATAL_ERROR "This Intel TBB package is intended to be used only in the project with MSVC version 1700 (vc11) or higher") - elseif (MSVC_VERSION VERSION_GREATER 1900) - set(_tbb_msvc_ver 1900) - endif() -endif() -set(_tbb_compiler_subdir ${_tbb_msvc_${_tbb_msvc_ver}_subdir}) -unset(_tbb_msvc_1700_subdir) -unset(_tbb_msvc_1800_subdir) -unset(_tbb_msvc_1900_subdir) - -if (WINDOWS_STORE) - set(_tbb_compiler_subdir ${_tbb_compiler_subdir}_ui) -endif() - -#set conveniance variable to locate TBB files (these are used for a PSXE install) -get_filename_component(_tbb_lib_path "${_tbb_root}/lib/${_tbb_arch_subdir}/${_tbb_compiler_subdir}" ABSOLUTE) -get_filename_component(_tbb_inc_path "${_tbb_root}/include/" ABSOLUTE) - -if (TBB_FOUND) - return() -endif() - -foreach (_tbb_lib_version 12 "") -foreach (_tbb_component ${TBB_FIND_COMPONENTS}) - set(_tbb_release_lib "${_tbb_lib_path}/${_tbb_component}${_tbb_lib_version}.lib") - set(_tbb_debug_lib "${_tbb_lib_path}/${_tbb_component}${_tbb_lib_version}_debug.lib") - - # oneDNN change: check library existence (BUILD_MODE related only, not both) - string(TOUPPER "${CMAKE_BUILD_TYPE}" UPPERCASE_CMAKE_BUILD_TYPE) - if (UPPERCASE_CMAKE_BUILD_TYPE STREQUAL "DEBUG") - if (EXISTS "${_tbb_debug_lib}") - set(_lib_exists TRUE) - elseif (EXISTS "${_tbb_release_lib}") - message(FATAL_ERROR - "Intel TBB release library is found here: ${_tbb_release_lib}. 
" - "But the debug library - (lib${_tbb_component}${tbb_lib_version}_debug.lib) is missing.") - endif() - else() - if (EXISTS "${_tbb_release_lib}") - set(_lib_exists TRUE) - endif() - endif() - - if (_lib_exists) - if (NOT TARGET TBB::${_tbb_component}) - add_library(TBB::${_tbb_component} SHARED IMPORTED) - set_target_properties(TBB::${_tbb_component} PROPERTIES - IMPORTED_CONFIGURATIONS "RELEASE;DEBUG" - IMPORTED_LOCATION_RELEASE "${_tbb_release_lib}" - IMPORTED_LOCATION_DEBUG "${_tbb_debug_lib}" - INTERFACE_INCLUDE_DIRECTORIES "${_tbb_inc_path}" - IMPORTED_IMPLIB_RELEASE "${_tbb_release_lib}" - IMPORTED_IMPLIB_DEBUG "${_tbb_debug_lib}" - INTERFACE_COMPILE_DEFINITIONS "__TBB_NO_IMPLICIT_LINKAGE=1") - - # Add internal dependencies for imported targets: TBB::tbbmalloc_proxy -> TBB::tbbmalloc - if (_tbb_component STREQUAL tbbmalloc_proxy) - set_target_properties(TBB::tbbmalloc_proxy PROPERTIES INTERFACE_LINK_LIBRARIES TBB::tbbmalloc) - endif() - - list(APPEND TBB_IMPORTED_TARGETS TBB::${_tbb_component}) - set(TBB_${_tbb_component}_FOUND 1) - endif() - break() - endif() -endforeach() -endforeach() - -foreach (_tbb_component ${TBB_FIND_COMPONENTS}) - if (NOT TARGET TBB::${_tbb_component} AND TBB_FIND_REQUIRED AND TBB_FIND_REQUIRED_${_tbb_component}) - message(FATAL_ERROR "Missed required Intel TBB component: ${_tbb_component}") - endif() -endforeach() - -unset(_tbb_x32_subdir) -unset(_tbb_x64_subdir) -unset(_tbb_arch_subdir) -unset(_tbb_compiler_subdir) -unset(_tbbmalloc_proxy_ix) -unset(_tbbmalloc_ix) -unset(_tbb_lib_path) -unset(_tbb_release_lib) -unset(_tbb_debug_lib) -unset(_tbb_lib_version) -unset(_lib_exists) diff --git a/doc/advanced/experimental.md b/doc/advanced/experimental.md index 0f55dfc0243..b3464c75871 100644 --- a/doc/advanced/experimental.md +++ b/doc/advanced/experimental.md @@ -22,14 +22,14 @@ Both kinds of experimental features can be enabled simultaneously. | Environment variable | Description | |:-----------------------------------------|:---------------------------------------------------------------------------------------------------------------------------------------------------------------| -| ONEDNN_EXPERIMENTAL_BNORM_STATS_ONE_PASS | Calculate mean and variance in batch normalization(BN) in single pass ([RFC](https://github.com/oneapi-src/oneDNN/tree/rfcs/rfcs/20210519-single-pass-bnorm)). | +| ONEDNN_EXPERIMENTAL_BNORM_STATS_ONE_PASS | Calculate mean and variance in batch normalization(BN) in single pass ([RFC](https://github.com/uxlfoundation/oneDNN/tree/rfcs/rfcs/20210519-single-pass-bnorm)). | +| ONEDNN_EXPERIMENTAL_GPU_CONV_V2 | Enable shapeless GPU convolution implementation (the feature is under development). | | Build time option | Description | |:-------------------------------------------|:-------------------------------------------------------------------| | ONEDNN_EXPERIMENTAL_SPARSE | Enable experimental API and functionality for sparse domain. | | ONEDNN_EXPERIMENTAL_UKERNEL | Enable experimental microkernel APIs and functionalities. | | ONEDNN_EXPERIMENTAL_PROFILING | Enable experimental profiling API. | -| ONEDNN_EXPERIMENTAL_GRAPH_COMPILER_BACKEND | Enable experimental graph compiler backend of the graph component. | | ONEDNN_EXPERIMENTAL_LOGGING | Enable experimental logging support for oneDNN verbose mode. | ## Features details @@ -55,25 +55,29 @@ of buffers. The order of the buffers in the vector matters and should correspond the buffers' indices. oneDNN also introduces a new format kind dnnl::memory::format_kind::sparse. 
-Sparse encoding (a.k.a. sparse format) is an
-enumeration type that specifies how data is encoded. Currently, oneDNN
-supports CSR (Compressed Sparse Row) and PACKED sparse encodings
-(dnnl::memory::sparse_encoding::csr, dnnl::memory::sparse_encoding_packed).
+Sparse encoding (a.k.a. sparse format) is an enumeration type that specifies
+how data is encoded. Currently, oneDNN supports Compressed Sparse Row (CSR),
+sorted co-ordinate (COO), and PACKED sparse encodings
+(dnnl::memory::sparse_encoding::csr, dnnl::memory::sparse_encoding::coo,
+dnnl::memory::sparse_encoding::packed) for the CPU engine, and only sorted
+COO for the GPU engine.
 
 The memory descriptor has dedicated static member functions for creating memory
 descriptors for different sparse encodings.
 
 Each encoding defines the number and meaning of the buffers.
 
-| Sparse encoding | Buffers                                  |
-|:----------------|:-----------------------------------------|
-| CSR             | 0 - values, 1 - indices, 2 - pointers    |
-| PACKED          | The meaning and content are unspecified  |
+| Sparse encoding | Buffers                                                                      |
+|:----------------|:-----------------------------------------------------------------------------|
+| CSR             | 0 - values, 1 - indices, 2 - pointers                                        |
+| Sorted COO      | 0 - values, 1 to *ndims* - indices (*ndims* - number of tensor dimensions)   |
+| PACKED          | The meaning and content are unspecified                                      |
 
-The pseudo-code below demonstrates how to create a memory object
-for CSR sparse encoding and use the new API to work with the
+The pseudocode below demonstrates how to create a memory object
+for the CSR and COO sparse encodings and use the new API to work with the
 underlying handles.
 
+###### CSR Encoding:
 ~~~cpp
     using namespace dnnl;
     const memory::dim M = 4, N = 6;
@@ -119,6 +123,49 @@ underlying handles.
     assert(pointers_handle == (void *)csr_pointers.data());
 ~~~
 
+###### Sorted COO Encoding:
+~~~cpp
+    using namespace dnnl;
+    const memory::dim M = 4, N = 6;
+    const memory::dim nnz = 5;
+    const auto values_dt = memory::data_type::f32;
+    const auto indices_dt = memory::data_type::s32;
+
+    // Create a memory descriptor for COO sparse encoding.
+    const auto coo_md = memory::desc::coo(
+            {M, N}, // Dimensions
+            values_dt, // Data type of values
+            nnz, // Number of non-zero entries
+            indices_dt); // Data type of indices (metadata)
+
+    // A sparse matrix represented in the COO format.
+    std::vector<float> coo_values = {2.5f, 1.5f, 1.5f, 2.5f, 2.0f};
+    std::vector<int32_t> coo_row_indices = {0, 1, 2, 2, 3};
+    std::vector<int32_t> coo_col_indices = {0, 2, 0, 5, 1};
+
+    // Create a memory object for the given buffers with values and metadata.
+ memory coo_mem(coo_md, engine, { + coo_values.data(), // Buffer with values + coo_row_indices.data(), // Buffer with row indices (metadata) + coo_col_indices.data() // Buffer with column indices (metadata) + }); + + const auto values_sz = coo_mem.get_size(0); + const auto indices_sz = coo_mem.get_size(1); + + assert(values_sz == coo_values.size() * sizeof(float)); + assert(indices_sz == coo_row_indices.size() * sizeof(int32_t)); + assert(indices_sz == coo_col_indices.size() * sizeof(int32_t)); + + void *values_handle = coo_mem.get_data_handle(0); + void *row_indices_handle = coo_mem.get_data_handle(1); + void *col_indices_handle = coo_mem.get_data_handle(2); + + assert(values_handle == (void *)coo_values.data()); + assert(row_indices_handle == (void *)coo_row_indices.data()); + assert(col_indices_handle == (void *)coo_col_indices.data()); +~~~ + A memory descriptor created for the sparse encoding PACKED cannot be used to create a memory object. It can only be used to create a primitive descriptor to query the actual memory descriptor @@ -132,14 +179,15 @@ This option enables the matmul primitive that can work with sparse input tensors. ###### CSR encoding -Only one of the input tensors is allowed to be sparse. The -output tensor is always dense. +Supported only for the CPU engine. Only one of the input tensors can be sparse. +The output tensor is always dense. -The following data types combinations are supported: +The following data type combinations are supported: -| Values | Indices | Pointers | -|:-------|:--------|:---------| -| f32 | s32 | s32 | +| Values (src, weight, dst) | Indices | +|:----------------------------|:---------| +| f16, f16, f16 | s32 | +| f32, f32, f32 | s32 | The following format tags are supported for dense input/output tensors: @@ -154,6 +202,34 @@ Benchdnn can be used to test matmul with a CSR input tensor as follows: For the case above, the number of non-zero elements for the source tensor is calculated as max(4 * 1000000 * (1 - 0.99), 1). +###### COO encoding +Supported only for the CPU and GPU engines. Only one of the input tensors can +be sparse. The output tensor is always dense. + +The following data type combinations are supported: + +| Values (src, weight, dst) | Indices | +|:----------------------------|:---------| +| f16, f16, f16 | s32 | +| f32, f32, f32 | s32 | + +The following format tags are supported for dense weights tensor: + +* ab +* ba + +The following format tags are supported for dense destination tensor: + +* ab + +See the example [here](@ref cpu_matmul_coo_cpp). + +Benchdnn can be used to test matmul with a COO input tensor as follows: +`./benchdnn --matmul --encoding=coo+0.99:: --wtag=ab --dtag=ab 4x1000000:1000000x128` + +For the case above, the number of non-zero elements for the source tensor is +calculated as max(4 * 1000000 * (1 - 0.99), 1). + ###### PACKED encoding Only the weights tensor is allowed to be sparse. The other tensors @@ -164,6 +240,7 @@ scales, zero-points, etc) that is supported for the dense weights should also work for the sparse weights. Currently, matmul has the following limitations for the PACKED encoding: +* Supported only for the CPU engine * Only Intel Advanced Matrix Extensions (Intel AMX) instruction set architecture (ISA) is supported * Only `s8` data type for the weights is supported @@ -188,11 +265,10 @@ In general, it is expected that all reorder-related functionality destination tensor should also work for the sparse one. 
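Tying the matmul support above together, below is a heavily hedged C++ sketch of creating a matmul primitive descriptor with a CSR-encoded source on a CPU engine. The shapes, nnz, and data types are illustrative only, and creation is expected to fail with an unimplemented status where no sparse kernel applies:

~~~cpp
#include "oneapi/dnnl/dnnl.hpp"

int main() {
    using namespace dnnl;
    engine eng(engine::kind::cpu, 0);

    const memory::dim M = 4, K = 1000000, N = 128;
    const memory::dim nnz = 40000; // illustrative non-zero count for src

    // Sparse CSR source; dense weights and destination use tag::ab, one of
    // the dense format tags listed above.
    auto src_md = memory::desc::csr({M, K}, memory::data_type::f32, nnz,
            memory::data_type::s32, memory::data_type::s32);
    auto wei_md = memory::desc({K, N}, memory::data_type::f32,
            memory::format_tag::ab);
    auto dst_md = memory::desc({M, N}, memory::data_type::f32,
            memory::format_tag::ab);

    // Only the src tensor is sparse; dst is always dense, matching the
    // restrictions documented above.
    auto pd = matmul::primitive_desc(eng, src_md, wei_md, dst_md);
    auto prim = matmul(pd);
    (void)prim;
    return 0;
}
~~~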
#### Common Limitations -* This functionality is not supported for SYCL and OpenCL runtimes -* The interoperability API for sparse memory is not provided +* The interoperability API to get/set data handles is not supported. Use the +runtime agnostic API to do that. * Sparse memory and memory descriptor can only be used with the Matrix -Multiplication and Reorder primitives -* Sparse memory can be created only for a CPU engine +Multiplication and Reorder primitives. ### ONEDNN_EXPERIMENTAL_UKERNEL @@ -257,11 +333,6 @@ user-provided queue. * Only Intel vendor is supported for SYCL runtime * Out-of-order queue is not supported -### ONEDNN_EXPERIMENTAL_GRAPH_COMPILER_BACKEND -This option extends the coverage scope of the graph API to cover larger fusion -patterns apart from primitive patterns. Refer to -[Graph Compiler](@ref dev_guide_graph_compiler) for more details. - @warning - Enabling some experimental features does not guarantee that the library will utilize them - Enabling some experimental features might change the accuracy of oneDNN primitives diff --git a/doc/advanced/understanding_memory_formats.md b/doc/advanced/understanding_memory_formats.md index 30d2f191179..c23cfeb124e 100644 --- a/doc/advanced/understanding_memory_formats.md +++ b/doc/advanced/understanding_memory_formats.md @@ -115,9 +115,9 @@ in this example. One can create memory with **NCHW** data layout using #dnnl_nchw of the enum type #dnnl_format_tag_t defined in -[dnnl_types.h](https://github.com/oneapi-src/oneDNN/blob/master/include/oneapi/dnnl/dnnl_types.h) +[dnnl_types.h](https://github.com/uxlfoundation/oneDNN/blob/main/include/oneapi/dnnl/dnnl_types.h) for the C API, and dnnl::memory::format_tag::nchw defined in -[dnnl.hpp](https://github.com/oneapi-src/oneDNN/blob/master/include/oneapi/dnnl/dnnl.hpp) +[dnnl.hpp](https://github.com/uxlfoundation/oneDNN/blob/main/include/oneapi/dnnl/dnnl.hpp) for the C++ API. diff --git a/doc/build/build.md b/doc/build/build.md index 6fb58bd6610..cdd1ea0c2d6 100644 --- a/doc/build/build.md +++ b/doc/build/build.md @@ -3,16 +3,16 @@ Build from Source {#dev_guide_build} ## Download the Source Code -Download [oneDNN source code](https://github.com/oneapi-src/oneDNN/archive/master.zip) -or clone [the repository](https://github.com/oneapi-src/oneDNN.git). +Download [oneDNN source code](https://github.com/uxlfoundation/oneDNN/archive/main.zip) +or clone [the repository](https://github.com/uxlfoundation/oneDNN.git). ~~~sh -git clone https://github.com/oneapi-src/oneDNN.git +git clone https://github.com/uxlfoundation/oneDNN.git ~~~ ## Build the Library -Ensure that all [software dependencies](https://github.com/oneapi-src/oneDNN#requirements-for-building-from-source) +Ensure that all [software dependencies](https://github.com/uxlfoundation/oneDNN#requirements-for-building-from-source) are in place and have at least the minimal supported version. The oneDNN build system is based on CMake. Use @@ -51,7 +51,7 @@ cmake .. - Build the library ~~~sh -make -j +make -j$(nproc) ~~~ #### Intel oneAPI DPC++/C++ Compiler with SYCL runtime @@ -86,7 +86,7 @@ it is installed in a custom location. - Build the library ~~~sh -make -j +make -j$(nproc) ~~~ #### GCC targeting AArch64 on x64 host @@ -106,7 +106,7 @@ cmake .. \ - Build the library ~~~sh -make -j +make -j$(nproc) ~~~ #### GCC with Arm Compute Library (ACL) on AArch64 host @@ -117,13 +117,13 @@ make -j ~~~sh export ACL_ROOT_DIR= cmake .. 
\ - -DDNNL_AARCH64_USE_ACL=ON \ + -DDNNL_USE_ACL=ON \ ~~~ - Build the library ~~~sh -make -j +make -j$(nproc) ~~~ ### Windows @@ -142,9 +142,13 @@ cmake -G "Visual Studio 16 2019" .. cmake --build . --config=Release ~~~ -@note CMake's Microsoft Visual Studio generator does not respect `CMAKE_BUILD_TYPE` option. -Solution file supports both Debug and Release builds with Debug being the default. -You can choose specific build type with `--config` option. +@note Currently, the oneDNN build system has limited support for multi-config + generators. Build configuration is based on the `CMAKE_BUILD_TYPE` option + (`Release` by default), and CMake must be rerun from scratch every time + the build type changes to apply the new build configuration. You can choose + a specific build type with the `--config` option (the solution file supports + both `Debug` and `Release` builds), but it must refer to the same build type + (`Release`, `Debug`, etc.) as selected with the `CMAKE_BUILD_TYPE` option. @note You can also open `oneDNN.sln` to build the project from the Microsoft Visual Studio IDE. diff --git a/doc/build/build_options.md b/doc/build/build_options.md index 2bcdede9ce2..a98310fb367 100644 --- a/doc/build/build_options.md +++ b/doc/build/build_options.md @@ -13,7 +13,6 @@ oneDNN supports the following build-time options. | ONEDNN_BUILD_TESTS | **ON**, OFF | Controls building the tests | | ONEDNN_BUILD_GRAPH | **ON**, OFF | Controls building graph component | | ONEDNN_ENABLE_GRAPH_DUMP | ON, **OFF** | Controls dumping graph artifacts | -| ONEDNN_EXPERIMENTAL_GRAPH_COMPILER_BACKEND | ON, **OFF** | Enables the [graph compiler backend](@ref dev_guide_graph_compiler) of the graph component (experimental)| | ONEDNN_ARCH_OPT_FLAGS | *compiler flags* | Specifies compiler optimization flags (see warning note below) | | ONEDNN_ENABLE_CONCURRENT_EXEC | ON, **OFF** | Disables sharing a common scratchpad between primitives in #dnnl::scratchpad_mode::library mode | | ONEDNN_ENABLE_JIT_PROFILING | **ON**, OFF | Enables [integration with performance profilers](@ref dev_guide_profilers) | @@ -87,14 +86,14 @@ dependencies for forward propagation kind part. #### ONEDNN_ENABLE_PRIMITIVE This option supports several values: `ALL` (the default) which enables all primitives implementations or a set of `BATCH_NORMALIZATION`, `BINARY`, -`CONCAT`, `CONVOLUTION`, `DECONVOLUTION`, `ELTWISE`, `INNER_PRODUCT`, -`LAYER_NORMALIZATION`, `LRN`, `MATMUL`, `POOLING`, `PRELU`, `REDUCTION`, -`REORDER`, `RESAMPLING`, `RNN`, `SDPA`, `SHUFFLE`, `SOFTMAX`, `SUM`. When a set -is used, only those selected primitives implementations will be available. -Attempting to use other primitive implementations will end up returning an -unimplemented status when creating primitive descriptor. In order to specify a -set, a CMake-style string should be used, with semicolon delimiters, as in this -example: +`CONCAT`, `CONVOLUTION`, `DECONVOLUTION`, `ELTWISE`, `GROUP_NORMALIZATION`, +`INNER_PRODUCT`, `LAYER_NORMALIZATION`, `LRN`, `MATMUL`, `POOLING`, `PRELU`, +`REDUCTION`, `REORDER`, `RESAMPLING`, `RNN`, `SDPA`, `SHUFFLE`, `SOFTMAX`, +`SUM`. When a set is used, only those selected primitives implementations will +be available. Attempting to use other primitive implementations will end up +returning an unimplemented status when creating primitive descriptor. 
In order +to specify a set, a CMake-style string should be used, with semicolon +delimiters, as in this example: ``` -DONEDNN_ENABLE_PRIMITIVE=CONVOLUTION;MATMUL;REORDER ``` @@ -118,7 +117,7 @@ Example that enables SSE41 and AVX2 sets: #### ONEDNN_ENABLE_PRIMITIVE_GPU_ISA This option supports several values: `ALL` (the default) which enables all ISA implementations or any set of `GEN9`, `GEN11`, `XELP`, `XEHP`, `XEHPG`, -`XEHPC`, and `XE2`. Selected ISA will enable correspondent parts in +`XEHPC`, `XE2`, and `XE3`. Selected ISA will enable correspondent parts in just-in-time kernel generation based implementations. OpenCL based kernels and implementations will always be available. Example that enables XeLP and XeHP set: @@ -303,7 +302,7 @@ $ cmake -DONEDNN_BLAS_VENDOR=ARMPL .. Additional options available for development/debug purposes. These options are subject to change without notice, see -[`cmake/options.cmake`](https://github.com/oneapi-src/oneDNN/blob/master/cmake/options.cmake) +[`cmake/options.cmake`](https://github.com/uxlfoundation/oneDNN/blob/main/cmake/options.cmake) for details. ## GPU Options @@ -335,20 +334,3 @@ CMake error. |:------------------------|:-------------------| | ONEDNN_GPU_VENDOR | NVIDIA | | ONEDNN_ENABLE_PRIMITIVE | PRIMITIVE_NAME | - -## Graph Compiler Backend Limitations - -As a backend of the graph component, besides the options described in -[Graph component limitations](@ref component_limitation), graph compiler -backend has some extra limitations. Specifying unsupported build options will -lead to a CMake error. - -| CMake Option | Unsupported Values | -| :-----------------------| :------------------| -| ONEDNN_CPU_RUNTIME | THREADPOOL, SYCL | -| ONEDNN_GPU_RUNTIME | OCL, SYCL | - -Besides, the instructions contained in the kernels generated by the graph -compiler backend are [AVX512_CORE](@ref dev_guide_cpu_dispatcher_control) or -above, so these kernels will not be dispatched on systems that do not have -corresponding instruction sets support. diff --git a/doc/graph/experimental_graph_compiler.md b/doc/graph/experimental_graph_compiler.md deleted file mode 100644 index 487de765695..00000000000 --- a/doc/graph/experimental_graph_compiler.md +++ /dev/null @@ -1,161 +0,0 @@ -Graph Compiler {#dev_guide_graph_compiler} -========================================== - -oneDNN Graph Compiler is an experimental backend for oneDNN Graph API. It can -generate optimized implementations for complex computational graphs including -multi-head attention (MHA), multi-layer perceptron (MLP), and convolution -residual blocks over typical data types for both inference and training. It -also brings improved performance by providing more flexible operator fusion. - -Use of oneDNN Graph Compiler is transparent for applications, as it does not -involve API or programming model changes. - -## Build-Time Controls -The following build time options only work when -`ONEDNN_EXPERIMENTAL_GRAPH_COMPILER_BACKEND` is ON. - -| CMake Option | Supported values (defaults in bold) | Description | -| :--- | :--- | :--- | -| ONEDNN_EXPERIMENTAL_GRAPH_COMPILER_CPU_JIT | llvm, c, **builtin** | Selects the CPU codegen and JIT to be built by graph compiler backend. Multiple codegen approaches can be used simultaneously. See the [example](@ref jit_options) for setting multiple codegen methods. | -| ONEDNN_EXPERIMENTAL_GRAPH_COMPILER_CPU_LLVM_CONFIG | **AUTO**, *path to llvm-config binary* | Defines the method for detecting and configuring LLVM. 
| - -@anchor jit_options -### Codegen and JIT Options -Graph compiler backend supports several different codegen and JIT options -including C, LLVM, and builtin (xbyak). Users can choose to build a subset of -available options by setting the `ONEDNN_EXPERIMENTAL_GRAPH_COMPILER_CPU_JIT` -option. - -~~~bash -cmake .. -DONEDNN_BUILD_GRAPH=ON -DONEDNN_EXPERIMENTAL_GRAPH_COMPILER_BACKEND=ON -DONEDNN_EXPERIMENTAL_GRAPH_COMPILER_CPU_JIT="c;builtin" -~~~ - -This will only build `c` and `builtin` codegen options. - -~~~bash -cmake .. -DONEDNN_BUILD_GRAPH=ON -DONEDNN_EXPERIMENTAL_GRAPH_COMPILER_BACKEND=ON -DONEDNN_EXPERIMENTAL_GRAPH_COMPILER_CPU_JIT="llvm;c;builtin" -~~~ - -This will build all three codegen options. - -#### C -C codegen generates temporary cpp files and adopts `g++` to compile them into -the executable. It can be used for debugging purposes as the generated code is -more friendly and readable to developers. - -#### LLVM -LLVM codegen generates LLVM-IR in memory. It provides the best performance -among all supported codegen methods. When LLVM codegen is chosen, extra LLVM -dependency is required. If LLVM does not exist in this case, a CMake error will -occur. - -Users can set `ONEDNN_EXPERIMENTAL_GRAPH_COMPILER_CPU_LLVM_CONFIG` to specify -the LLVM to be integrated. By default, -`ONEDNN_EXPERIMENTAL_GRAPH_COMPILER_CPU_LLVM_CONFIG` is set to `AUTO`, which -auto-detects existing LLVM in the environment. If auto-detection fails or user -wants to explicitly specify the version of LLVM, a specific path to -*llvm-config binary* shall be set. - -Users can follow the [guidelines](https://llvm.org/docs/GettingStarted.html#getting-the-source-code-and-building-llvm) -to build and install LLVM from source, or download and install the pre-built -binary from [here](https://apt.llvm.org/). - -@note **LLVM 10.0 or above** is required to enable LLVM codegen. - -#### Builtin -Builtin codegen and JIT method is implemented with xbyak technology inside. -Compared with C or LLVM codegen, it has no extra dependency. - -## Environment Variables -The following environment variables are introduced by the graph compiler -backend. - -| Environment Variable | Value | Description | -| :--- | :--- |:--- | -| ONEDNN_EXPERIMENTAL_GRAPH_COMPILER_CPU_JIT | **llvm** | Uses LLVM as codegen and JIT method | -| | builtin | Uses builtin as codegen and JIT method | -| | c | Uses C as codegen and JIT method | -| ONEDNN_EXPERIMENTAL_GRAPH_COMPILER_OPT_LEVEL | 0 | Turns off optimization passes and sets the compilation optimization level to be 0 in C and LLVM JIT | -| | 1,2,**3** | Sets the compilation optimization level of C and LLVM JIT | -| ONEDNN_EXPERIMENTAL_GRAPH_COMPILER_KERNEL_TRACE | **0** | No kernel execution trace output | -| | 1,*stderr or filename.json* | Generates kernel execution trace to the file specified by the given filename with chrome tracing format | -| ONEDNN_EXPERIMENTAL_GRAPH_COMPILER_PRINT_PASS_RESULT | **0** | No IR output after each graph or tensor IR pass | -| ONEDNN_EXPERIMENTAL_GRAPH_COMPILER_PRINT_PASS_RESULT | 1 | Prints the output IR of each graph and tensor IR passes | -| ONEDNN_EXPERIMENTAL_GRAPH_COMPILER_VERBOSE | **0** | No verbose output | -| | 1 | Prints warning messages during compilation | -| | 2 | Prints warning messages and info logs (e.g. 
fusion-related information) during compilation | -| ONEDNN_EXPERIMENTAL_GRAPH_COMPILER_DUMP_GENCODE | *path_to_dump* | Dumps the generated kernel in C | -| ONEDNN_EXPERIMENTAL_GRAPH_COMPILER_C_INCLUDE | *path_to_c_codegen_header* | Specifies the C codegen header for JIT compilation | - -### Enable Tracing - -~~~bash -ONEDNN_EXPERIMENTAL_GRAPH_COMPILER_KERNEL_TRACE=1 ./application -~~~ - -This will produce a kernel execution trace in JSON format that will be -stored to the default destination: `./sctrace.json`. - -~~~bash -ONEDNN_EXPERIMENTAL_GRAPH_COMPILER_KERNEL_TRACE=1,stderr ./application -~~~ - -This will dump a kernel execution trace to the *stderr* stream. - -~~~bash -ONEDNN_EXPERIMENTAL_GRAPH_COMPILER_KERNEL_TRACE=1,/tmp/filename.json ./application -~~~ - -This will produce a kernel execution trace in JSON format that will be stored -to the user specified path `/tmp/filename.json`. - -### Switch Between Different Codegen Methods -By default, codegen methods have priorities ranked from higher to lower as -`llvm`, `c`, `builtin`. When multiple codegen and JIT methods are enabled at -build stage, the method with the highest priority is adopted at runtime by -default. - -Users can switch to a different codegen method at runtime by setting -`ONEDNN_EXPERIMENTAL_GRAPH_COMPILER_CPU_JIT`. - -~~~bash -ONEDNN_EXPERIMENTAL_GRAPH_COMPILER_CPU_JIT=builtin ./application -~~~ - -This will switch the CPU codegen and JIT method to `builtin` (xbyak). - -~~~bash -ONEDNN_EXPERIMENTAL_GRAPH_COMPILER_CPU_JIT=c ./application -~~~ - -This will switch the CPU codegen and JIT method to `c`. - -When using C codegen option, the generated C code will rely on existing runtime -function declarations in `cpu_include.hpp`. -`ONEDNN_EXPERIMENTAL_GRAPH_COMPILER_C_INCLUDE` environment variable is used to -specify the corresponding include path. -Normally, the include path is automatically set at CMake build stage. But if -the following error message occurs -`environment variable ONEDNN_EXPERIMENTAL_GRAPH_COMPILER_C_INCLUDE is not set`, -users shall manually set `ONEDNN_EXPERIMENTAL_GRAPH_COMPILER_C_INCLUDE` to -`/path_to_onednn_repo/src/graph/backend/graph_compiler/core/src`. - -@warning The specified codegen method must be built. Otherwise, the default -codegen method would be used. - -### Enable Code Dumping -Users can use `ONEDNN_EXPERIMENTAL_GRAPH_COMPILER_DUMP_GENCODE` variable to -generate offline C kernels. - -~~~bash -ONEDNN_EXPERIMENTAL_GRAPH_COMPILER_DUMP_GENCODE="./dump_code" ./application -~~~ - -This will dump the generated C kernels to `dump_code` folder. - -@warning `ONEDNN_EXPERIMENTAL_GRAPH_COMPILER_DUMP_GENCODE` works under both LLVM -and C codegen. - -@warning The user specified `ONEDNN_EXPERIMENTAL_GRAPH_COMPILER_DUMP_GENCODE` -path shall be an existing folder. Otherwise the code dumping will not be in -effect. diff --git a/doc/graph/fusion_patterns/gated_mlp.md b/doc/graph/fusion_patterns/gated_mlp.md new file mode 100644 index 00000000000..73611ab6e1f --- /dev/null +++ b/doc/graph/fusion_patterns/gated_mlp.md @@ -0,0 +1,123 @@ +Gated Multi-Layer Perceptron (Gated-MLP) {#dev_guide_graph_gated_mlp} +===================================================================== + +## Overview + +Gated Multi-Layer Perceptron (Gated-MLP) is a variant of MLP which is widely +used as the Feed Forward Network (FFN) in many Transformer-based Large Language +Models (LLMs). 
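The sections below define this pattern formally; as a taste of the Graph API style they rely on, here is a heavily hedged sketch of assembling the two first-layer MatMuls ("FC up" and "FC gate"). The ids, shapes, and op wiring are hypothetical and modeled on the repository's Graph API examples, not copied from them:

~~~cpp
#include "oneapi/dnnl/dnnl_graph.hpp"

int main() {
    using namespace dnnl::graph;
    using dt = logical_tensor::data_type;
    using lt = logical_tensor::layout_type;

    graph g(dnnl::engine::kind::cpu);

    // Hypothetical ids/shapes: src (B, C) and two weights (C, H).
    logical_tensor src{0, dt::f32, {8, 512}, lt::strided};
    logical_tensor w1{1, dt::f32, {512, 2048}, lt::strided};
    logical_tensor w2{2, dt::f32, {512, 2048}, lt::strided};
    logical_tensor up{3, dt::f32, {8, 2048}, lt::strided};
    logical_tensor gate{4, dt::f32, {8, 2048}, lt::strided};

    op fc_up(0, op::kind::MatMul, "fc_up"); // "FC up": src x W1
    fc_up.add_inputs({src, w1});
    fc_up.add_outputs({up});

    op fc_gate(1, op::kind::MatMul, "fc_gate"); // "FC gate": src x W2
    fc_gate.add_inputs({src, w2});
    fc_gate.add_outputs({gate});

    g.add_op(fc_up);
    g.add_op(fc_gate);
    // ... the activation, Multiply, and final "FC down" MatMul follow ...
    g.finalize();

    auto partitions = g.get_partitions(); // ideally one fused partition
    (void)partitions;
    return 0;
}
~~~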
+
+Typically, the FFN in the Transformer architecture [1] is defined as a
+two-layer MLP with a ReLU activation in between, which can be replaced with
+other activations.
+
+\f[
+
+    FFN(src,W,V) = ReLU(src \cdot W) \cdot V
+
+\f]
+
+Gated Linear Unit (GLU) is adopted to replace the first linear layer to
+improve the quality of Transformer-based models [2]:
+
+\f[
+
+    GLU(src,W_1,W_2) = (src \cdot W_1) \otimes Sigmoid(src \cdot W_2) \\
+
+    FFN(src,W_1,W_2,V) = GLU(src,W_1,W_2) \cdot V
+
+\f]
+
+where \f$ src \cdot W_1 \f$ is usually called "FC (fully-connected) up",
+\f$ src \cdot W_2 \f$ is called "FC gate", and the last linear layer is called
+"FC down".
+
+Swish activation is further adopted to replace Sigmoid in the GLU to form
+swiGLU.
+
+\f[
+
+    Swish(x) = x \otimes Sigmoid(x) \\
+
+    swiGLU(src,W_1,W_2) = (src \cdot W_1) \otimes Swish(src \cdot W_2) \\
+
+    FFN(src,W_1,W_2,V) = swiGLU(src,W_1,W_2) \cdot V
+
+\f]
+
+The Gated-MLP based on swiGLU is also adopted in LLMs like LLaMA [3], Qwen [4],
+etc.
+
+## Gated-MLP patterns
+
+oneDNN supports Gated-MLP and its optimization through Graph API [5] by
+defining the graph, getting partitions from the graph, and optimizing the
+kernels underneath. In general, a Gated-MLP pattern is defined as a directed
+acyclic graph (DAG) using oneDNN Graph API.
+
+### Floating-point Gated-MLP
+
+oneDNN defines floating-point (f32, bf16, and f16) Gated-MLP as follows. The
+blue nodes are required when defining a Gated-MLP pattern while the brown nodes
+are optional.
+
+![Gated-MLP pattern](images/fp-gated-mlp.png)
+
+1. The first MatMul on the top left calculates "FC up": \f$ src \cdot W_1 \f$.
+   See [MatMul](@ref dev_guide_op_matmul) operation in Graph API.
+2. The second MatMul on the top right calculates "FC gate": \f$ src \cdot W_2 \f$.
+3. The Activation node is optional. If required, it can be constructed with the
+   activation operations in Graph API, for example, [ReLU](@ref dev_guide_op_relu),
+   [GELU](@ref dev_guide_op_gelu), [Sigmoid](@ref dev_guide_op_sigmoid), and so on.
+   For Swish activation, the node can be constructed with the [Sigmoid](@ref dev_guide_op_sigmoid)
+   and [Multiply](@ref dev_guide_op_multiply) operations as below. You can also
+   refer to the [Gated-MLP example](https://github.com/uxlfoundation/oneDNN/tree/main/examples/graph/gated_mlp.cpp)
+   for the Swish definition.
+
+   ![Swish Activation](images/gated-mlp-swish.png)
+
+4. The last MatMul on the bottom performs the "FC down" operation between the
+   GLU output and \f$V\f$.
+
+## Data Types
+
+oneDNN supports the floating-point Gated-MLP pattern with data types f32, bf16,
+and f16. You can specify the data type via the input and output data type fields
+of logical tensors for each operation. oneDNN does not support mixing different
+floating-point data types in a floating-point Gated-MLP pattern.
+
+The definition of the data types and support status on different CPU and GPU
+platforms follow the general description in @ref dev_guide_data_types.
+
+## Implementation limitations
+
+1. oneDNN primitive-based Gated-MLP is implemented as the reference
+   implementation on both Intel Architecture Processors and Intel Graphics
+   Products. In this case, floating-point Gated-MLP patterns are usually
+   implemented with three f32, bf16, or f16 matmul (with binary or eltwise
+   post-ops) primitives.
+2. The Gated-MLP patterns functionally support all input shapes meeting the
+   shape requirements of each operation in the graph. For example, the `MatMul`
+   operation requires shape consistency for the `k` dimension.
+   The `Multiply` operation requires the input tensors to have the same shape,
+   or shapes that can be properly broadcast based on the operation attribute.
+
+## Examples
+
+oneDNN provides a [Gated-MLP
+example](https://github.com/uxlfoundation/oneDNN/tree/main/examples/graph/gated_mlp.cpp)
+demonstrating how to construct a typical floating-point Gated-MLP pattern with
+oneDNN Graph API on CPU and GPU with different runtimes.
+
+For applications where the weights of FC up and FC gate are combined as a single
+tensor, oneDNN also provides an
+[example](https://github.com/uxlfoundation/oneDNN/tree/main/examples/graph/gated_mlp_wei_combined.cpp)
+demonstrating how to create the weight tensors for the pattern with the offsets
+and strides from the combined weight tensor.
+
+## References
+
+1. Attention is all you need, https://arxiv.org/abs/1706.03762v7
+2. GLU Variants Improve Transformer, https://arxiv.org/abs/2002.05202
+3. LLaMA: Open and Efficient Foundation Language Models, https://arxiv.org/abs/2302.13971
+4. Qwen Technical Report, https://arxiv.org/abs/2309.16609
+5. oneDNN Graph API documentation, https://uxlfoundation.github.io/oneDNN/graph_extension.html
diff --git a/doc/graph/fusion_patterns/gqa.md b/doc/graph/fusion_patterns/gqa.md
new file mode 100644
index 00000000000..84d846924cb
--- /dev/null
+++ b/doc/graph/fusion_patterns/gqa.md
@@ -0,0 +1,106 @@
+Grouped Query Attention (GQA) {#dev_guide_graph_gqa}
+====================================================
+
+## Overview
+
+In a typical Scaled Dot-Product Attention (SDPA) [1], the input Query, Key, and
+Value tensors have the same head number. Loading the Key and Value tensors in
+each generation step becomes a performance bottleneck, especially as the
+sequence length grows.
+
+To reduce the memory bandwidth overhead of loading the Key and Value tensors,
+Multi-Query Attention (MQA) [2] is created by reducing the head number of Key
+and Value tensors to one, which means multiple Query heads map to the same
+single Key and Value tensor. However, MQA may lead to model quality degradation
+and training instability. Therefore, Grouped-Query Attention (GQA) [3], an
+interpolation between the typical SDPA and MQA, is proposed with a single Key
+and Value head per subgroup of Query heads. The head number of Key and Value
+equals the group number of Query heads.
+
+The notations used in the document:
+
+- N: the mini-batch size.
+- H_q: the head number of Query.
+- H_kv: the head number of Key or Value.
+- N_rep: H_q / H_kv, indicates how many Query heads are mapped to one Key head.
+- S: the sequence length.
+- D: the size of each head.
+
+## GQA Pattern
+
+Similar to how SDPA is supported, the GQA pattern is also defined as a
+directed acyclic graph (DAG) using oneDNN Graph API. oneDNN extends the
+[SDPA pattern](@ref dev_guide_graph_sdpa) to support floating-point (f32, bf16,
+and f16) GQA as follows. The blue nodes are required when defining a GQA pattern
+while the brown nodes are optional.
+
+![GQA pattern](images/gqa.png)
+
+Compared to a typical SDPA pattern, there are a few differences in the GQA
+pattern:
+
+1. The input Query has shape (N, H_q, S, D). It will be reshaped to (N, H_kv,
+   N_rep, S, D) by splitting the H_q dimension into H_kv and N_rep. The
+   reshaping can be constructed using the [StaticReshape](@ref dev_guide_op_staticreshape)
+   operation in Graph API.
+2. Similarly, the input Key and Value have shape (N, H_kv, S, D).
+   They will be reshaped to (N, H_kv, 1, S, D) to meet the input shape
+   requirement of the [MatMul](@ref dev_guide_op_matmul) operation.
+3. The second MatMul calculates the dot products between the probabilities
+   after SoftMax and the Value node, and generates output with shape
+   (N, H_kv, N_rep, S, D).
+4. Another StaticReshape operation is applied to the output of the second MatMul
+   to convert the shape into (N, H_q, S, D) by combining the H_kv and N_rep
+   dimensions.
+5. The input scale factor and mask in the pattern also need to meet the
+   operations' shape requirements, which can similarly be achieved through
+   StaticReshape. Apart from that, they have the same definition as described
+   in the typical SDPA pattern.
+
+## Data Types
+
+oneDNN supports the floating-point GQA pattern with data types f32, bf16, and
+f16. You can specify the data type via the input and output data type fields of
+logical tensors for each operation. oneDNN does not support mixing different
+floating-point data types in a floating-point GQA pattern.
+
+The definition of the data types and support status on different CPU and GPU
+platforms follow the general description in @ref dev_guide_data_types.
+
+## Implementation Limitations
+
+1. oneDNN primitive-based GQA is implemented as the reference implementation on
+   both Intel Architecture Processors and Intel Graphics Products. The reference
+   implementation requires memory to store the intermediate results of the dot
+   products between Query and Key which takes \f$O(S^2)\f$ memory. It may lead
+   to an out-of-memory error when computing long sequence length inputs on
+   platforms with limited memory.
+2. The GQA patterns functionally support all input shapes meeting the shape
+   requirements of each operation in the graph.
+3. CPU
+   - Optimized implementation is available for 4D Q/K/V tensors with shape
+     defined as (N, H_q, S, D) for Query and (N, H_kv, S, D) for Key and Value.
+   - Optimized implementation is available for OpenMP runtime and Threadpool
+     runtime on Intel Architecture Processors.
+   - Specifically for OpenMP runtime, the optimized implementation requires `N *
+     H_q > 2 * thread number` to get enough parallelism.
+4. GPU
+   - Optimized implementation is available for 4D Q/K/V tensors with shape
+     defined as (N, H_q, S, D) for Query and (N, H_kv, S, D) for Key and Value.
+   - Optimized implementation is available for floating-point GQA with `f16`
+     data type and `D <= 256` on Intel Graphics Products with Intel(R) Xe Matrix
+     Extensions (Intel(R) XMX) support.
+
+## Example
+
+oneDNN provides a [GQA
+example](https://github.com/uxlfoundation/oneDNN/tree/main/examples/graph/gqa.cpp)
+demonstrating how to construct a floating-point GQA pattern with oneDNN Graph
+API on CPU and GPU with different runtimes.
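+
+Before the full example, here is a minimal sketch of the reshape steps that
+distinguish GQA from SDPA, written against the public Graph API. It is
+illustrative only: the sizes (N=1, H_q=32, H_kv=8, hence N_rep=4, S=128, D=64),
+tensor IDs, and op names are hypothetical, and the scale, mask, SoftMax, and
+second MatMul parts of the pattern are elided.
+
+~~~cpp
+#include <cstdint>
+#include <vector>
+
+#include "oneapi/dnnl/dnnl_graph.hpp"
+
+using namespace dnnl::graph;
+
+int main() {
+    const auto dt = logical_tensor::data_type::f32;
+    const auto lt = logical_tensor::layout_type::strided;
+
+    // Query (N, H_q, S, D) reshaped to (N, H_kv, N_rep, S, D).
+    logical_tensor q {0, dt, {1, 32, 128, 64}, lt};
+    logical_tensor q5d {1, dt, {1, 8, 4, 128, 64}, lt};
+    op reshape_q {0, op::kind::StaticReshape, {q}, {q5d}, "reshape_q"};
+    reshape_q.set_attr<std::vector<int64_t>>(op::attr::shape, {1, 8, 4, 128, 64});
+    reshape_q.set_attr<bool>(op::attr::special_zero, false);
+
+    // Key (N, H_kv, S, D) reshaped to (N, H_kv, 1, S, D) for broadcasting.
+    logical_tensor k {2, dt, {1, 8, 128, 64}, lt};
+    logical_tensor k5d {3, dt, {1, 8, 1, 128, 64}, lt};
+    op reshape_k {1, op::kind::StaticReshape, {k}, {k5d}, "reshape_k"};
+    reshape_k.set_attr<std::vector<int64_t>>(op::attr::shape, {1, 8, 1, 128, 64});
+    reshape_k.set_attr<bool>(op::attr::special_zero, false);
+
+    // First MatMul: Q x K^T, producing shape (N, H_kv, N_rep, S, S).
+    logical_tensor score {4, dt, {1, 8, 4, 128, 128}, lt};
+    op qk {2, op::kind::MatMul, {q5d, k5d}, {score}, "qk"};
+    qk.set_attr<bool>(op::attr::transpose_b, true);
+
+    graph g {engine::kind::cpu};
+    g.add_op(reshape_q);
+    g.add_op(reshape_k);
+    g.add_op(qk);
+    g.finalize();
+    auto partitions = g.get_partitions();
+    return partitions.empty() ? 1 : 0;
+}
+~~~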
+
+## References
+
+[1] Attention is all you need, https://arxiv.org/abs/1706.03762v7
+
+[2] Fast Transformer Decoding: One Write-Head is All You Need, https://arxiv.org/abs/1911.02150
+
+[3] GQA: Training Generalized Multi-Query Transformer Models from Multi-Head Checkpoints, https://arxiv.org/abs/2305.13245
diff --git a/doc/graph/fusion_patterns/images/compressed_sdpa_pattern.png b/doc/graph/fusion_patterns/images/compressed_sdpa_pattern.png
new file mode 100644
index 00000000000..b0563e7fb0e
Binary files /dev/null and b/doc/graph/fusion_patterns/images/compressed_sdpa_pattern.png differ
diff --git a/doc/graph/fusion_patterns/images/fp-gated-mlp.png b/doc/graph/fusion_patterns/images/fp-gated-mlp.png
new file mode 100644
index 00000000000..a52952ce87b
Binary files /dev/null and b/doc/graph/fusion_patterns/images/fp-gated-mlp.png differ
diff --git a/doc/graph/fusion_patterns/images/gated-mlp-swish.png b/doc/graph/fusion_patterns/images/gated-mlp-swish.png
new file mode 100644
index 00000000000..2050ee8d871
Binary files /dev/null and b/doc/graph/fusion_patterns/images/gated-mlp-swish.png differ
diff --git a/doc/graph/fusion_patterns/images/gqa.png b/doc/graph/fusion_patterns/images/gqa.png
new file mode 100644
index 00000000000..0871903bcda
Binary files /dev/null and b/doc/graph/fusion_patterns/images/gqa.png differ
diff --git a/doc/graph/images/sdpa-mask-1.png b/doc/graph/fusion_patterns/images/sdpa-mask-1.png
similarity index 100%
rename from doc/graph/images/sdpa-mask-1.png
rename to doc/graph/fusion_patterns/images/sdpa-mask-1.png
diff --git a/doc/graph/images/sdpa-mask-2.png b/doc/graph/fusion_patterns/images/sdpa-mask-2.png
similarity index 100%
rename from doc/graph/images/sdpa-mask-2.png
rename to doc/graph/fusion_patterns/images/sdpa-mask-2.png
diff --git a/doc/graph/fusion_patterns/images/sdpa-mask-3.png b/doc/graph/fusion_patterns/images/sdpa-mask-3.png
new file mode 100644
index 00000000000..339a9589122
Binary files /dev/null and b/doc/graph/fusion_patterns/images/sdpa-mask-3.png differ
diff --git a/doc/graph/images/sdpa-reorder.png b/doc/graph/fusion_patterns/images/sdpa-reorder.png
similarity index 100%
rename from doc/graph/images/sdpa-reorder.png
rename to doc/graph/fusion_patterns/images/sdpa-reorder.png
diff --git a/doc/graph/fusion_patterns/images/sdpa.png b/doc/graph/fusion_patterns/images/sdpa.png
new file mode 100644
index 00000000000..07add3d2afe
Binary files /dev/null and b/doc/graph/fusion_patterns/images/sdpa.png differ
diff --git a/doc/graph/fusion_patterns/sdpa.md b/doc/graph/fusion_patterns/sdpa.md
new file mode 100644
index 00000000000..75528dc56ba
--- /dev/null
+++ b/doc/graph/fusion_patterns/sdpa.md
@@ -0,0 +1,157 @@
+Scaled Dot-Product Attention (SDPA) {#dev_guide_graph_sdpa}
+===========================================================
+
+## Overview
+
+Scaled Dot-Product Attention (SDPA) is introduced in [1] as the core operation
+of the Transformer block, which has become the backbone of many language models
+and generative models (BERT, Stable Diffusion, GPT, etc.).
+
+The input of SDPA consists of query (Q), key (K), and value (V). The attention
+output is computed as:
+
+\f[
+
+    attention(Q,K,V) = V \cdot softmax(\frac{QK^T}{\sqrt{d_k}})
+
+\f]
+
+\f$d_k\f$ is the dimension size of K. Other notations used in the document:
+
+- N: the mini-batch size.
+- H: the head number.
+- S: the sequence length.
+- D: the size of each head.
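+
+As a concrete shape walk-through (added for clarity; the shapes follow
+directly from the formula above): with Q, K, and V all of shape (N, H, S, D),
+the first MatMul \f$QK^T\f$ produces an (N, H, S, S) score tensor, SoftMax
+preserves that shape, and the final multiplication with V brings the attention
+output back to (N, H, S, D).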
+ +## SDPA patterns + +oneDNN supports SDPA and its optimization through Graph API [2] by defining the +SDPA graph, getting partition from the graph, and optimizing the kernels +underneath. In general, an SDPA pattern is defined as a directional acyclic +graph (DAG) using oneDNN Graph API. + +### Floating-point SDPA + +oneDNN defines floating-point (f32, bf16, or f16) SDPA as follows. The blue +nodes are required when defining an SDPA pattern while the brown parts are +optional. + +![SDPA pattern](images/sdpa.png) + +1. The first MatMul calculates the dot products between Query and Key. See + [MatMul](@ref dev_guide_op_matmul) operation in Graph API. +2. The Scale node is optional and is used to scale the output of the first + MatMul with a scaling factor. It can be constructed by [Multiply](@ref dev_guide_op_multiply) + or [Divide](@ref dev_guide_op_divide) operation in Graph API. The scaling + factor is given by users as an input of SDPA. \f$\sqrt{d_k}\f$ in the formula + is not considered as a part of the SDPA pattern because it is a constant. +3. The Mask node is optional and is used to apply an attention mask to the + output of the previous Scale node. There are two types of masks that can + be applied: + + 1. Explicit user-generated mask: You can explicitly create a mask tensor + and pass it to the library for the computation of SDPA. In this case, mask + can be constructed by [Add](@ref dev_guide_op_add) + or [Select](@ref dev_guide_op_select) operation in Graph API for different + mask policies (for example, causal mask or padding mask). When the + Add operation is used to apply the mask, the input mask is usually an upper + triangular matrix with all the elements above the diagonal filled with + `-inf` and zeroes elsewhere. The `-inf` entries will become zero probability + after Softmax is applied in the next step. + Alternatively, a Select operation may be used. In this case, the + input is a boolean tensor (for example, with the boolean value set to `true` + on and below the diagonal, and `false` above the diagonal). + A `false` element in the mask forces the corresponding element of the scaled + output to `-inf`, while a `true` element leaves it unchanged. + + ![SDPA-mask-1](images/sdpa-mask-1.png) ![SDPA-mask-2](images/sdpa-mask-2.png) + + 2. Implicit library-generated mask: You can use the operations in the library + to generate a mask by constructing a subgraph. Currently, Graph API supports + generating an implicit causal mask (top-left aligned) using operations of + [GenIndex](@ref dev_guide_op_genindex), [GreaterEqual](@ref dev_guide_op_greaterequal) + and [Select](@ref dev_guide_op_select). + + ![SDPA-mask-3](images/sdpa-mask-3.png) + +4. The SoftMax operation takes the masked output and transforms it into + probabilities between 0 and 1. See [SoftMax](@ref dev_guide_op_softmax) + operation in Graph API. +5. The second MatMul calculates the dot products between the probabilities after + SoftMax and Value. +6. The Reorder node is optional and used to reshape or transpose the attention + output for cases where the attention output is transformed from shape (N, H, + S, D) to (N, S, H, D) or (N, S, H * D). The node can be constructed by the + combinations of [StaticTranspose](@ref dev_guide_op_statictranspose) and + [StaticReshape](@ref dev_guide_op_staticreshape) operation in Graph API. + + ![SDPA-Reorder](images/sdpa-reorder.png) + + +## Data Types + +oneDNN supports the floating-point SDPA pattern with data types f32, bf16, and +f16. 
You can specify the data type via the input and output logical tensors'
+data type fields for each operation.
+
+oneDNN supports bf16 or f16 SDPA with f32 intermediate type, which means the
+Q/K/V tensors have bf16 or f16 data type while the output of the first MatMul,
+Scale, Mask, and the input of SoftMax are in f32 data type.
+
+oneDNN supports the quantized SDPA pattern with int8-f32 mixed precision,
+int8-bf16 mixed precision, and int8-f16 mixed precision data types.
+
+The definition of the data types and support status on different CPU and GPU
+platforms follow the general description in @ref dev_guide_data_types.
+
+## Implementation limitations
+
+1. oneDNN primitive-based SDPA is implemented as the reference implementation on
+   both Intel Architecture Processors and Intel Graphics Products. In this case,
+   floating-point SDPA patterns are usually implemented with f32, bf16, or f16
+   matmul (with post-ops) and softmax primitives, while quantized SDPA patterns
+   are implemented with int8 matmul (with post-ops) and f32, bf16, or f16
+   softmax primitives. The reference implementation requires memory to store the
+   intermediate results of the dot products between Query and Key which takes
+   \f$O(S^2)\f$ memory. It may lead to an out-of-memory error when computing
+   long sequence length inputs on platforms with limited memory. For an implicit
+   causal mask, the reference implementation is only available on CPU.
+2. The SDPA patterns functionally support all input shapes meeting the shape
+   requirements of each operation in the graph. For example, Add, Multiply,
+   Divide, and Select operations require the input tensors to have the same
+   shape or shapes that can be properly broadcast based on the operation
+   attribute.
+3. CPU
+   - Optimized implementation is available for 4D Q/K/V tensors with shape
+     defined as (N, H, S, D).
+   - Optimized implementation is available for OpenMP runtime and Threadpool
+     runtime on Intel Architecture Processors.
+   - Specifically for OpenMP runtime, the optimized implementation requires `N *
+     H > 2 * thread number` to get enough parallelism.
+4. GPU
+   - Optimized implementation is available for 4D Q/K/V tensors with shape
+     defined as (N, H, S, D).
+   - Optimized implementation is available for `f16` or `bf16` SDPA with `f32`
+     intermediate data type and `D <= 256` on Intel Graphics Products with
+     Intel(R) Xe Matrix Extensions (Intel(R) XMX) support.
+
+## Example
+
+oneDNN provides an [SDPA
+example](https://github.com/uxlfoundation/oneDNN/tree/main/examples/graph/sdpa.cpp)
+demonstrating how to construct a typical floating-point SDPA pattern with oneDNN
+Graph API on CPU and GPU with different runtimes.
+
+oneDNN also provides a [MQA (Multi-Query Attention)
+example](https://github.com/uxlfoundation/oneDNN/tree/main/examples/graph/mqa.cpp) [3]
+demonstrating how to construct a floating-point MQA pattern with the same
+pattern structure as in the SDPA example but a different head number in Key and
+Value tensors. In MQA, the head number of Key and Value is always one.
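+
+The following minimal sketch shows how such a pattern can be assembled with the
+Graph API. It is not a substitute for the full example: the shapes (N=1, H=16,
+S=384, D=64), tensor IDs, and op names are hypothetical, the mask is omitted,
+and error handling is skipped for brevity.
+
+~~~cpp
+#include <cstdint>
+
+#include "oneapi/dnnl/dnnl_graph.hpp"
+
+using namespace dnnl::graph;
+
+int main() {
+    const auto dt = logical_tensor::data_type::f32;
+    const auto lt = logical_tensor::layout_type::strided;
+
+    logical_tensor q {0, dt, {1, 16, 384, 64}, lt};   // (N, H, S, D)
+    logical_tensor k {1, dt, {1, 16, 384, 64}, lt};   // (N, H, S, D)
+    logical_tensor v {2, dt, {1, 16, 384, 64}, lt};   // (N, H, S, D)
+    logical_tensor scale {3, dt, {1}, lt};            // user-provided factor
+    logical_tensor score {4, dt, {1, 16, 384, 384}, lt};
+    logical_tensor scaled {5, dt, {1, 16, 384, 384}, lt};
+    logical_tensor probs {6, dt, {1, 16, 384, 384}, lt};
+    logical_tensor out {7, dt, {1, 16, 384, 64}, lt};
+
+    // First MatMul: Q x K^T.
+    op qk {0, op::kind::MatMul, {q, k}, {score}, "qk"};
+    qk.set_attr<bool>(op::attr::transpose_b, true);
+
+    // Scale node constructed with Divide, as described in step 2.
+    op div {1, op::kind::Divide, {score, scale}, {scaled}, "scale"};
+
+    // SoftMax over the last dimension.
+    op softmax {2, op::kind::SoftMax, {scaled}, {probs}, "softmax"};
+    softmax.set_attr<int64_t>(op::attr::axis, -1);
+
+    // Second MatMul: probabilities x V.
+    op pv {3, op::kind::MatMul, {probs, v}, {out}, "pv"};
+
+    graph g {engine::kind::cpu};
+    g.add_op(qk);
+    g.add_op(div);
+    g.add_op(softmax);
+    g.add_op(pv);
+    g.finalize();
+    auto partitions = g.get_partitions(); // ideally one fused SDPA partition
+    return partitions.empty() ? 1 : 0;
+}
+~~~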
+
+## References
+
+[1] Attention is all you need, https://arxiv.org/abs/1706.03762v7
+
+[2] oneDNN Graph API documentation, https://uxlfoundation.github.io/oneDNN/graph_extension.html
+
+[3] Fast Transformer Decoding: One Write-Head is All You Need, https://arxiv.org/abs/1911.02150
diff --git a/doc/graph/fusion_patterns/sdpa_with_compressed_kv.md b/doc/graph/fusion_patterns/sdpa_with_compressed_kv.md
new file mode 100644
index 00000000000..e7a55ef571c
--- /dev/null
+++ b/doc/graph/fusion_patterns/sdpa_with_compressed_kv.md
@@ -0,0 +1,119 @@
+SDPA with Compressed Key and Value {#dev_guide_graph_sdpa_compressed_kv}
+========================================================================
+
+## Overview
+
+int4 and int8 compressions for Key and Value are exploited in fused Scaled
+Dot-Product Attention (SDPA) [1] to reduce the memory footprint of generative
+inference of LLMs, especially when the KV cache mechanism is adopted.
+Specifically, Key and Value tensors are stored using lower precision data types
+like int4 and int8 to reduce memory usage, and are subsequently de-quantized to
+wider floating-point data types such as f16 and bf16 for computation.
+
+Note that grouped quantization is required to improve the model accuracy,
+especially for int4 data types. In this case, a group size is needed as an
+attribute for quantization, which indicates the number of elements that share
+the same scaling factor and zero-points in each quantization group.
+
+The notations used in this topic are:
+
+- N: The mini-batch size.
+- H: The head number.
+- S: The sequence length.
+- D: The size of each head.
+- G: The group size.
+
+## SDPA Pattern
+
+The SDPA pattern with compressed Key and Value is defined as a directed
+acyclic graph (DAG) using oneDNN Graph API. oneDNN extends the
+[SDPA pattern](@ref dev_guide_graph_sdpa) to support the following three kinds
+of compressed SDPA patterns:
+
+1. SDPA with compressed Key and Value.
+2. SDPA with floating-point Key and compressed Value.
+3. SDPA with compressed Key and floating-point Value.
+
+The floating-point data types include f32, f16, and bf16, and the compressed
+data type refers to low-precision integral data types, including int4 (u4/s4)
+and int8 (u8/s8).
+
+In oneDNN Graph API, we support quantization through a pattern with quantization
+operations such as [DynamicDequantize](@ref dev_guide_op_dynamicdequantize) and
+[DynamicQuantize](@ref dev_guide_op_dynamicquantize). The supported pattern is
+as follows. The blue nodes are required while the brown nodes are optional.
+
+![compressed SDPA pattern](images/compressed_sdpa_pattern.png)
+
+Compared to a typical SDPA pattern, there are a few differences:
+
+1. Two additional DynamicDequantize operations are applied to the input Key and
+Value to convert the integral values to floating-point values.
+2. Apart from the Query, Key and Value inputs, the pattern requires additional
+quantization information such as scales and zero-points for the dequantization
+of Key and Value tensors. Currently, oneDNN only supports grouped quantization
+on one dimension; specifically, the shapes of scales and zero-points for Key and
+Value de-quantization should be (N, H, S, D/G).
+3. Additionally, the `group_shape` attribute of the quantization operations must
+be specified as (1, 1, 1, G) for Key and Value dequantization (see the sketch
+after this list).
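+
+The following minimal sketch illustrates the Key dequantization part of the
+pattern under hypothetical sizes (N=1, H=8, S=128, D=64, G=16, so D/G=4) with a
+u8 Key cache and f16 scales. Tensor IDs and op names are made up, the rest of
+the SDPA graph consuming the dequantized Key is elided, and the exact `qtype`
+string for grouped quantization (`per_group` below) should be checked against
+the DynamicDequantize operation reference.
+
+~~~cpp
+#include <cstdint>
+#include <string>
+#include <vector>
+
+#include "oneapi/dnnl/dnnl_graph.hpp"
+
+using namespace dnnl::graph;
+
+int main() {
+    const auto lt = logical_tensor::layout_type::strided;
+
+    // Compressed Key (N, H, S, D) with grouped scales/zero-points of
+    // shape (N, H, S, D/G).
+    logical_tensor k_u8 {0, logical_tensor::data_type::u8, {1, 8, 128, 64}, lt};
+    logical_tensor k_scale {1, logical_tensor::data_type::f16, {1, 8, 128, 4}, lt};
+    logical_tensor k_zp {2, logical_tensor::data_type::u8, {1, 8, 128, 4}, lt};
+    logical_tensor k_f16 {3, logical_tensor::data_type::f16, {1, 8, 128, 64}, lt};
+
+    op deq_k {0, op::kind::DynamicDequantize, {k_u8, k_scale, k_zp}, {k_f16},
+            "deq_k"};
+    deq_k.set_attr<std::string>(op::attr::qtype, "per_group");
+    deq_k.set_attr<std::vector<int64_t>>(op::attr::group_shape, {1, 1, 1, 16});
+
+    graph g {engine::kind::gpu};
+    // Compute in f16 and let the mode apply to the integral inputs as well.
+    g.set_fpmath_mode(dnnl::fpmath_mode::f16, /*apply_to_int=*/true);
+    g.add_op(deq_k);
+    // ... add the remaining SDPA ops consuming k_f16 here ...
+    g.finalize();
+    auto partitions = g.get_partitions();
+    return partitions.empty() ? 1 : 0;
+}
+~~~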
+
+## Data Types
+
+oneDNN supports the following combinations of data types for Query, Key, Value,
+output, scale for Key, zero-points for Key, scale for Value, and zero-points
+for Value:
+
+| Query | Key | Scale_K | Zp_K | Value | Scale_V | Zp_V | Output |
+|:--------|:--------|:--------|:----------------|:-------|:--------|:----------------|:-------|
+| dt_fp | dt_int | dt_fp | u4,s4,u8,s8,s32 | dt_int | dt_fp | u4,s4,u8,s8,s32 | dt_fp |
+| dt_fp | dt_int | dt_fp | u4,s4,u8,s8,s32 | dt_fp | N/A | N/A | dt_fp |
+| dt_fp | dt_fp | N/A | N/A | dt_int | dt_fp | u4,s4,u8,s8,s32 | dt_fp |
+
+Notes:
+- dt_fp can be: f16, bf16, or f32.
+- dt_int can be: u8, s8, u4, or s4.
+- zero-point inputs are optional.
+
+You can specify the data type via the input and output data type fields of
+logical tensors for each operation. The definition of the data types and support
+status on different CPU and GPU platforms follow the general description in
+@ref dev_guide_data_types.
+
+### Floating-point Math Mode
+
+You should set the floating-point math mode
+(@ref dev_guide_attributes_fpmath_mode) when using SDPA with compressed Key and
+Value. Generally, the math mode should align with the data type of the Query,
+which indicates the computation data type. Additionally, the second boolean
+flag, `apply_to_int`, should be set to true. You can configure these attribute
+values using the `set_fpmath_mode` API
+(@ref dnnl::graph::graph::set_fpmath_mode) on the graph object.
+
+## Implementation Limitations
+
+- oneDNN primitive-based SDPA with compressed Key and Value is implemented as
+a reference implementation on both Intel Architecture Processors and Intel
+Graphics Products. The reference implementation requires memory to store the
+intermediate results of the dot products between Query and Key which takes
+\f$O(S^2)\f$ memory. It may lead to an out-of-memory error when computing long
+sequence length inputs on platforms with limited memory.
+- The compressed SDPA patterns functionally support all input shapes meeting
+the shape requirements of each operation in the graph.
+- CPU
+  - oneDNN does not currently provide an optimized implementation on CPU. All
+    executions will be implemented with the primitive-based reference
+    computation.
+- GPU
+  - Optimized implementation is available for 4D Q/K/V tensors with the shape
+    defined as (N, H, S, D) for Query and Value, (N, H, D, S) for Key,
+    (N, H, D/G, S) for scales and zero-points of Key (if available) and
+    (N, H, S, D/G) for scales and zero-points of Value (if available).
+  - Optimized implementation is available for compressed SDPA with `f16`
+    computation data type on Intel Graphics Products with Intel(R) Xe Matrix
+    Extensions (Intel(R) XMX) support.
+  - If int4 zero-points are specified, the optimized implementation will only
+    be available when the group size equals 16.
+
+## References
+
+[1] Attention is all you need, https://arxiv.org/abs/1706.03762v7
diff --git a/doc/graph/images/sdpa.png b/doc/graph/images/sdpa.png
deleted file mode 100644
index 87f4443bf49..00000000000
Binary files a/doc/graph/images/sdpa.png and /dev/null differ
diff --git a/doc/graph/operations/Add.md b/doc/graph/operations/Add.md
index 6f5342b382c..5fef1ab7d7e 100644
--- a/doc/graph/operations/Add.md
+++ b/doc/graph/operations/Add.md
@@ -44,8 +44,10 @@ different and auto-broadcasting is allowed if `auto_broadcast` attributes is
 Add operation supports the following data type combinations.
 
-| Src_0 / Src_1 | Dst |
-|:--------------|:-----|
-| f32 | f32 |
-| bf16 | bf16 |
-| f16 | f16 |
+| Src_0 | Src_1 | Dst |
+|:----------|:----------|:-----|
+| f32 | f32 | f32 |
+| bf16 | bf16 | bf16 |
+| f16 | f16 | f16 |
+| f32 | bf16, f16 | f32 |
+| bf16, f16 | f32 | f32 |
diff --git a/doc/graph/operations/Divide.md b/doc/graph/operations/Divide.md
index 8c4ab535544..11689c9b7eb 100644
--- a/doc/graph/operations/Divide.md
+++ b/doc/graph/operations/Divide.md
@@ -44,8 +44,10 @@ different and auto-broadcasting is allowed if `auto_broadcast` attributes is
 Divide operation supports the following data type combinations.
 
-| Src_0 / Src_1 | Dst |
-|:--------------|:-----|
-| f32 | f32 |
-| bf16 | bf16 |
-| f16 | f16 |
+| Src_0 | Src_1 | Dst |
+|:----------|:----------|:-----|
+| f32 | f32 | f32 |
+| bf16 | bf16 | bf16 |
+| f16 | f16 | f16 |
+| f32 | bf16, f16 | f32 |
+| bf16, f16 | f32 | f32 |
diff --git a/doc/graph/operations/DynamicDequantize.md b/doc/graph/operations/DynamicDequantize.md
index 9e730f1fb54..46aba4667f9 100644
--- a/doc/graph/operations/DynamicDequantize.md
+++ b/doc/graph/operations/DynamicDequantize.md
@@ -3,11 +3,11 @@ DynamicDequantize {#dev_guide_op_dynamicdequantize}
 
 ## General
 
-DynamicDequantize operation converts a quantized (s8 or u8) tensor to a f32
-tensor. It supports both per-tensor and per-channel asymmetric linear
-de-quantization. Rounding mode is library-implementation defined. Unlike the
-@ref dev_guide_op_dequantize, DynamicDequantize takes scales and zero-points as
-operator src tensors.
+The Dynamic Dequantize operation converts a quantized (s4, u4, s8, or u8)
+tensor to a bf16, f16, or f32 tensor. It supports per-tensor, per-channel, and
+per-group asymmetric linear de-quantization. The rounding mode is defined by
+the library implementation. Unlike the @ref dev_guide_op_dequantize, Dynamic
+Dequantize takes scales and zero-points as operator src tensors.
 
 For per-tensor de-quantization
 
@@ -16,12 +16,23 @@
 For per-channel de-quantization, taking channel axis = 1 as an example:
 \f[ {dst}_{\cdots,i,\cdots,\cdots} = (src_{\cdots,i,\cdots,\cdots} - zps_i)*scales_i,i\in [0,channelNum-1] \f]
 
+For per-group de-quantization, let's take group shape = Gx1 as an example. It
+indicates that one scaling factor will be adopted for G elements in the src
+tensor. On the dimensions where group quantization is adopted, make channelNum
+equal to the dimension of src and groupNum equal to channelNum/group size:
+ \f[ {dst}_{i,\cdots} = (src_{i,\cdots} - zps_j)*scales_j,i\in [0,channelNum-1],j\in [0,groupNum-1] \f]
+Where:
+ \f[ i = j*groupSize+k,k\in [0,groupSize-1] \f]
+On other dimensions:
+ \f[ {dst}_{i,\cdots} = (src_{i,\cdots} - zps_i)*scales_i,i\in [0,channelNum-1] \f]
+
 ## Operation attributes
 
 | Attribute Name | Description | Value Type | Supported Values | Required or Optional |
 |:-------------------------------------------|:---------------------------------------------------------------------|:-----------|:------------------------------------------------------------------------------------------------------------------------------------------------|:---------------------|
 | [qtype](@ref dnnl::graph::op::attr::qtype) | Specifies which de-quantization type is used. | string | `per_tensor` (default), `per_channel` | Optional |
-| [axis](@ref dnnl::graph::op::attr::axis) | Specifies dimension on which per-channel de-quantization is applied. | s64 | A s64 value in the range of [-r, r-1] where r = rank(src), `1` by default. Negative value means counting the dimension backwards from the end. | Optional |
+| [axis](@ref dnnl::graph::op::attr::axis) | Specifies dimension on which per-channel de-quantization is applied. | s64 | An s64 value in the range of [-r, r-1] where r = rank(src), `1` by default. Negative values mean counting the dimension backwards from the end. | Optional |
+| [group_shape](@ref dnnl::graph::op::attr::group_shape) | Specifies the group shape of an operation. | s64 | An s64 list indicating the group size on the dimensions where grouped quantization is adopted. | Optional |
 
 ## Execution arguments
 
@@ -36,15 +47,23 @@ constructing an operation.
 | 1 | `scales` | Required |
 | 2 | `zps` | Optional |
 
-@note `scales` is a f32 1D tensor to be applied to the de-quantization formula.
-For `qtype` = `per-tensor`, there should be only one element in the scales
-tensor. For `qtype` = `per-channel`, the element number should be equal to the
-element number of src tensor along the dimension axis.
-
-@note `zps` is a 1D tensor with offset values that map to zero. For `qtype` =
-`per-tensor`, there should be only one element in the zps tensor. For `qtype` =
+@note `scales` is a bf16/f16/f32 tensor to be applied to the de-quantization
+formula. For `qtype` = `per-tensor`, there should be only one element in the
+`scales` tensor. For `qtype` = `per-channel`, the element number should be
+equal to the element number of the src tensor along the dimension axis. For
+`qtype` = `per-group`, the `scales` tensor should have the same number of
+dimensions as the `src` tensor. On the dimensions where grouped quantization is
+applied, the dimension should be the number of groups, which equals
+`src_dim` / `group_size`, while other dimensions should match the `src` tensor.
+
+@note `zps` is a tensor with offset values that map to zero. For `qtype` =
+`per-tensor`, there should be only one element in the `zps` tensor. For `qtype` =
 `per-channel`, the element number should be equal to the element number of input
-tensor along the dimension axis. If omitted, zps values are assumed to be zero.
+tensor along the dimension axis. For `qtype` = `per-group`, the `zps` tensor
+should have the same number of dimensions as the `src` tensor. On the dimensions
+where grouped quantization is applied, the dimension should be the number of
+groups, which equals `src_dim` / `group_size`, while other dimensions should
+match the `src` tensor. If omitted, the `zps` values are assumed to be zero.
 
 ### Outputs
 
@@ -58,5 +77,9 @@ DynamicDequantize operation supports the following data type combinations.
 
 | Src | Dst | Scales | Zps |
 |:----|:----|:-------|:------------|
-| s8 | f32 | f32 | s8, u8, s32 |
-| u8 | f32 | f32 | s8, u8, s32 |
+| s8 | f16, bf16, f32 | f16, bf16, f32 | s8, u8, s32 |
+| u8 | f16, bf16, f32 | f16, bf16, f32 | s8, u8, s32 |
+| s4 | f16, bf16, f32 | f16, bf16, f32 | s4, u4, s32 |
+| u4 | f16, bf16, f32 | f16, bf16, f32 | s4, u4, s32 |
+
+The data types of `scales` and `dst` are expected to be the same.
diff --git a/doc/graph/operations/GenIndex.md b/doc/graph/operations/GenIndex.md
new file mode 100644
index 00000000000..ff3306633dc
--- /dev/null
+++ b/doc/graph/operations/GenIndex.md
@@ -0,0 +1,39 @@
+GenIndex {#dev_guide_op_genindex}
+=================================
+
+## General
+
+The GenIndex operation creates an index tensor along a specified axis of
+an input tensor. The resulting index tensor has the same shape as the
+input tensor, with each element representing the index along the
+specified axis.
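+
+For example (an illustration, not taken from the library documentation): for a
+2x3 `src` tensor, `axis` = 1 yields `dst` = `[[0, 1, 2], [0, 1, 2]]`, while
+`axis` = 0 yields `dst` = `[[0, 0, 0], [1, 1, 1]]`. The values of `src` do not
+affect `dst`; only its shape and the `axis` attribute do.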
+
+## Operation Attributes
+
+| Attribute Name | Description | Value Type | Supported Values | Required or Optional |
+|:------------------------------------------|:----------------------------------------------------------------|:-----------|:-----------------------------------------------------------|:---------------------|
+| [axis](@ref dnnl::graph::op::attr::axis) | Specifies the dimension along which index values are generated. | s64 | An s64 value in the range of [-r, r-1] where r = rank(src) | Required |
+
+## Execution Arguments
+
+### Input
+
+| Index | Argument Name | Required or Optional |
+|:------|:--------------|:---------------------|
+| 0 | `src` | Required |
+
+### Output
+
+| Index | Argument Name | Required or Optional |
+|:------|:--------------|:---------------------|
+| 0 | `dst` | Required |
+
+## Supported Data Types
+
+The GenIndex operation supports the following data type combinations.
+
+| Src | Dst |
+|:-------|:-------|
+| f32 | s32 |
+| bf16 | s32 |
+| f16 | s32 |
diff --git a/doc/graph/operations/GreaterEqual.md b/doc/graph/operations/GreaterEqual.md
new file mode 100644
index 00000000000..912b066b45c
--- /dev/null
+++ b/doc/graph/operations/GreaterEqual.md
@@ -0,0 +1,49 @@
+GreaterEqual {#dev_guide_op_greaterequal}
+=========================================
+
+## General
+
+The GreaterEqual operation performs an element-wise greater-than-or-equal
+comparison between two given tensors. This operation applies
+the multi-directional broadcast rules to ensure compatibility between
+the tensors of different shapes.
+
+\f[ dst = \begin{cases} true & \text{if}\ src_0 \ge src_1 \\
+    false & \text{if}\ src_0 < src_1 \end{cases} \f]
+
+## Operation Attributes
+
+| Attribute Name | Description | Value Type | Supported Values | Required or Optional |
+|:-------------------------------------------------------------|:-----------------------------------------------------------|:-----------|:-------------------------|:---------------------|
+| [auto_broadcast](@ref dnnl::graph::op::attr::auto_broadcast) | Specifies rules used for auto-broadcasting of src tensors. | string | `none`,`numpy` (default) | Optional |
+
+## Execution Arguments
+
+### Input
+
+| Index | Argument Name | Required or Optional |
+|:------|:--------------|:---------------------|
+| 0 | `src_0` | Required |
+| 1 | `src_1` | Required |
+
+@note Both src shapes should match and no auto-broadcasting is allowed if
+the `auto_broadcast` attribute is `none`. `src_0` and `src_1` shapes can be
+different and auto-broadcasting is allowed if the `auto_broadcast` attribute
+is `numpy`. Broadcasting is performed according to the `auto_broadcast` value.
+
+### Output
+
+| Index | Argument Name | Required or Optional |
+|:------|:--------------|:---------------------|
+| 0 | `dst` | Required |
+
+## Supported Data Types
+
+The GreaterEqual operation supports the following data type combinations.
+
+| Src_0 / Src_1 | Dst |
+|:--------------|:---------|
+| f32 | boolean |
+| bf16 | boolean |
+| f16 | boolean |
+| s32 | boolean |
diff --git a/doc/graph/operations/MatMul.md b/doc/graph/operations/MatMul.md
index d2b4cc89b0f..7879393969a 100644
--- a/doc/graph/operations/MatMul.md
+++ b/doc/graph/operations/MatMul.md
@@ -61,8 +61,8 @@ constructing an operation.
 
 MatMul operation supports the following data type combinations.
 
-| Src | Weights | Bias | Dst |
-|:-----|:--------|:-----|:-----|
-| f32 | f32 | f32 | f32 |
-| bf16 | bf16 | bf16 | bf16 |
-| f16 | f16 | f16 | f16 |
+| Src | Weights | Bias | Dst |
+|:-----|:--------|:-----|:----------|
+| f32 | f32 | f32 | f32 |
+| bf16 | bf16 | bf16 | f32, bf16 |
+| f16 | f16 | f16 | f32, f16 |
diff --git a/doc/graph/operations/Multiply.md b/doc/graph/operations/Multiply.md
index 625bfea10d2..24e09881e10 100644
--- a/doc/graph/operations/Multiply.md
+++ b/doc/graph/operations/Multiply.md
@@ -44,8 +44,10 @@ different and auto-broadcasting is allowed if `auto_broadcast` attributes is
 Multiply operation supports the following data type combinations.
 
-| Src_0 / Src_1 | Dst |
-|:--------------|:-----|
-| f32 | f32 |
-| bf16 | bf16 |
-| f16 | f16 |
+| Src_0 | Src_1 | Dst |
+|:----------|:----------|:-----|
+| f32 | f32 | f32 |
+| bf16 | bf16 | bf16 |
+| f16 | f16 | f16 |
+| f32 | bf16, f16 | f32 |
+| bf16, f16 | f32 | f32 |
diff --git a/doc/graph/operations/Softmax.md b/doc/graph/operations/Softmax.md
index 6655eb218d6..467634b1d05 100644
--- a/doc/graph/operations/Softmax.md
+++ b/doc/graph/operations/Softmax.md
@@ -36,8 +36,8 @@ constructing an operation.
 SoftMax operation supports the following data type combinations.
 
-| Src | Dst |
-|:-----|:-----|
-| f32 | f32 |
-| bf16 | bf16 |
-| f16 | f16 |
+| Src | Dst |
+|:-----|:----------------|
+| f32 | f32, bf16, f16 |
+| bf16 | bf16 |
+| f16 | f16 |
diff --git a/doc/graph/operations/Subtract.md b/doc/graph/operations/Subtract.md
index 28138271a5a..bca45816cc8 100644
--- a/doc/graph/operations/Subtract.md
+++ b/doc/graph/operations/Subtract.md
@@ -44,8 +44,10 @@ different and auto-broadcasting is allowed if `auto_broadcast` attributes is
 Subtract operation supports the following data type combinations.
 
-| Src_0 / Src_1 | Dst |
-|:--------------|:-----|
-| f32 | f32 |
-| bf16 | bf16 |
-| f16 | f16 |
+| Src_0 | Src_1 | Dst |
+|:----------|:----------|:-----|
+| f32 | f32 | f32 |
+| bf16 | bf16 | bf16 |
+| f16 | f16 | f16 |
+| f32 | bf16, f16 | f32 |
+| bf16, f16 | f32 | f32 |
diff --git a/doc/graph/programming_model/graph_basic_concepts.md b/doc/graph/programming_model/graph_basic_concepts.md
index 5ee5eaa558b..b2f75349d1f 100644
--- a/doc/graph/programming_model/graph_basic_concepts.md
+++ b/doc/graph/programming_model/graph_basic_concepts.md
@@ -41,13 +41,19 @@ tensor as the edge between them.
 
 ## Graph
 
 `Graph` (@ref dnnl::graph::graph) contains a set of operations. A graph object
-is associated to a specific engine kind (@ref dnnl::engine::kind). Multiple
-operations can be added (@ref dnnl::graph::graph::add_op) along with input and
-output logical tensors to a graph. After finishing adding operations,
-finalization API (@ref dnnl::graph::graph::finalize) can be called to indicate
-that the graph is ready for partitioning. By calling partitioning API (@ref
-dnnl::graph::graph::get_partitions), a group of partitions from the graph will
-be returned.
+is associated to a specific engine kind (@ref dnnl::engine::kind). In addition,
+you can set the graph-level floating-point math mode through the setter API
+(@ref dnnl::graph::graph::set_fpmath_mode) or in the constructor. The API
+accepts two parameters: the floating-point math mode and an optional boolean
+flag that indicates whether to use floating-point arithmetic for integral
+operations.
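+
+A minimal sketch of the setter API is shown below (the engine kind and math
+mode are illustrative choices, not requirements):
+
+~~~cpp
+#include "oneapi/dnnl/dnnl_graph.hpp"
+
+int main() {
+    using namespace dnnl::graph;
+
+    graph g {engine::kind::cpu};
+    // Allow bf16 math for f32 operations; the second argument opts integral
+    // operations into the same floating-point arithmetic.
+    g.set_fpmath_mode(dnnl::fpmath_mode::bf16, /*apply_to_int=*/true);
+    // ... add operations, finalize, and get partitions as described below.
+    return 0;
+}
+~~~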
+
+Multiple operations can be added (@ref dnnl::graph::graph::add_op) along with
+input and output logical tensors to a graph. After finishing adding the
+operations, the finalization API (@ref dnnl::graph::graph::finalize) can be
+called to indicate that the graph is ready for partitioning. By calling the
+partitioning API (@ref dnnl::graph::graph::get_partitions), a group of
+partitions from the graph will be returned.
 
 ## Partition
diff --git a/doc/graph/programming_model/low_precision.md b/doc/graph/programming_model/low_precision.md
index 35118771b96..83b7744ba25 100644
--- a/doc/graph/programming_model/low_precision.md
+++ b/doc/graph/programming_model/low_precision.md
@@ -52,7 +52,6 @@ Graph operations support bf16 and f16 data types.
 
 A TypeCast operation performing down conversion should be inserted clearly to
 indicate the use of low numeric precision. oneDNN Graph implementation fully
-honors the API-specified numeric precision and only performs the computation
-using the API-specified or higher numeric precision.
+honors the API-specified numeric precision.
 
 @img{bf16_programming.jpg,Figure 2: Overview of bf16 programming model.,80%,}
diff --git a/doc/graph/rst/graph_fusion_patterns.rst b/doc/graph/rst/graph_fusion_patterns.rst
new file mode 100644
index 00000000000..ce4bca97f7f
--- /dev/null
+++ b/doc/graph/rst/graph_fusion_patterns.rst
@@ -0,0 +1,171 @@
+Fusion Patterns
+###############
+
+.. default-role:: math
+.. toctree::
+   :maxdepth: 1
+   :hidden:
+
+   dev_guide_graph_gated_mlp
+   dev_guide_graph_gqa
+   dev_guide_graph_sdpa_compressed_kv
+   dev_guide_graph_sdpa
+
+
+The following fusion patterns are subgraphs that the oneDNN Graph API
+recognizes as candidates for fusion. The patterns are described using
+oneDNN Graph operation (op) names with the following convention.
+
+.. note::
+   oneDNN Graph performs limited input validation to minimize
+   the performance overheads. The application is responsible for
+   sanitizing inputs passed to the library. Because large ``u8`` or
+   ``s8`` inputs may lead to accumulator overflow, you can use
+   floating-point patterns instead of quantized patterns.
+
+``"+"`` describes a chain of two ops. The preceding op produces an
+output tensor, which is consumed by the following op as its first
+operand.
+
+``"[]"`` describes a component of the overall pattern description. For
+example, it could include a subgraph or all the op choices within the
+bracket.
+
+``"|"`` describes choices of multiple operations, say A+[B|C] means the
+graph partition contains A followed by B or C.
+
+``","`` describes a graph composed of multiple subgraphs, each of which
+marks its output tensor explicitly; the output is then consumed by other
+subgraphs.
+
+``Superscript`` denotes the number of repetitions of a pattern. For example,
+A+[B|C] `^{3}` means the graph partition
+contains A followed by three ops, each of them either B or C. The
+superscript can be a range of numbers, allowing a range of
+repetitions. If the range is between 0 and 1, we use superscript ``"?"``.
+
+``Subscript`` denotes the input and output tensors that explicitly mark
+the producer and consumer relations within one graph
+partition. For example,
+A `_{>t1}`\ +B+C `_{<t1}`, where ``">"`` refers to the output tensor,
+and ``"<"`` to the input tensor. Input
+and output tensors between neighbor ops are not explicitly marked; for
+example, B consumes t1 implicitly in the example above.
+
+Subscript ``"out"`` marks the output tensor of a certain op to be the
+output of a graph partition. For example, in
+A `_{>t1}`\ +B `_{>out}`\ +C `_{<t1,>out}`,
+B's output and C's output are marked as output tensors.
+
+Subscript ``"in"`` marks the input tensor of a certain op to be the
+input of a graph partition. For example, in
+A `_{<in1,>t1}`\ +B+C `_{<in2,<t1}`, A's input tensor in1 and C's input
+tensor in2 are marked as input tensors of the graph partition.
+
+Inference
+~~~~~~~~~
+
+Floating Point Patterns
+^^^^^^^^^^^^^^^^^^^^^^^
+
+.. list-table::
+   :widths: 75 25
+   :header-rows: 1
+
+   * - Pattern
+     - Description
+   * - Scaled Dot-Product Attention
+     - Refer to `Scaled Dot-Product Attention (SDPA) <dev_guide_graph_sdpa.html>`_ for more details.
+   * - Grouped Query Attention
+     - Refer to `Grouped Query Attention (GQA) <dev_guide_graph_gqa.html>`_ for more details.
+   * - Scaled Dot-Product Attention with Compressed Key/Value
+     - Refer to `Scaled Dot-Product Attention with Compressed Key/Value <dev_guide_graph_sdpa_compressed_kv.html>`_ for more details.
+   * - Gated Multi-Layer Perceptron (Gated-MLP)
+     - Refer to `Gated Multi-Layer Perceptron (Gated-MLP) <dev_guide_graph_gated_mlp.html>`_ for more details.
+   * - Convolution + BiasAdd `^?` + BatchNormInference `^?` + [Unary \| Binary] `^{0-3}` `_{>out}`
+     - This pattern is widely used in Convolutional Neural Networks, for example ResNet, ResNext, SSD, etc.
+   * - ConvTranspose + BiasAdd `^?` + [Unary \| Binary] `^{0-3}` `_{>out}`
+     - This pattern is widely used in Generative Adversarial Networks.
+   * - Interpolate + [Unary \| Binary] `^{0-3}` `_{>out}`
+     - This pattern is widely used for image processing.
+   * - MatMul + BiasAdd `^?` + [Unary \| Binary] `^{0-3}` + Select `^?` `_{>out}`
+     - This pattern is widely used in language models and recommendation models, for example BERT, DLRM, etc.
+   * - Reduction + [Unary \| Binary] `^{0-3}` `_{>out}`
+     - This pattern is widely used for data processing, for example loss reduction.
+   * - Unary + Binary `^{0-3}` `_{>out}`
+     - This pattern is widely used in Convolutional Neural Networks.
+   * - Binary + [Unary \| Binary] `^{0-3}` `_{>out}`
+     - This pattern is widely used in language models and recommendation models, for example BERT, DLRM, etc.
+   * - [AvgPool \| MaxPool] + Binary `^{0-3}` `_{>out}`
+     - This pattern is widely used in Convolutional Neural Networks.
+   * - BatchNormInference + ReLU `_{>out}`
+     - This pattern is widely used in Convolutional Neural Networks, for example DenseNet.
+   * - Reciprocal + Multiply `_{>out}`
+     - N/A
+   * - Reorder + Add `_{>out}`
+     - N/A
+
+Quantized Patterns
+^^^^^^^^^^^^^^^^^^
+
+.. list-table::
+   :widths: 75 25
+   :header-rows: 1
+
+   * - Pattern
+     - Description
+   * - Quantize `^?` + Dequantize `_{>t1}`, Dequantize `_{>t2}` `^{0-3}`, Dequantize + Convolution `_{<t1,<t2,>out}`
+     - N/A
+   * - Quantize `^?` + Dequantize `_{>t1}`, Dequantize `_{>t2}` `^{0-3}`, Dequantize + ConvTranspose `_{<t1,<t2,>out}`
+     - N/A
+   * - Quantize `^?` + Dequantize `_{>t1}`, Dequantize `_{>t2}` `^{0-3}`, Dequantize + MatMul `_{<t1,<t2,>out}`
+     - N/A
+   * - Dequantize + [AvgPool \| MaxPool] + Quantize `_{>out}`
+     - N/A
+   * - Dequantize `_{>t1}`, Dequantize + [AvgPool \| MaxPool] + Add `_{<t1,>out}`
+     - N/A
+   * - Dequantize + Reorder + Quantize `_{>out}`
+     - This pattern is widely used in Generative Adversarial Networks.
+   * - Dequantize `_{>t1}`, Dequantize + Reorder + Add `_{<t1,>out}`
+     - This pattern is widely used for image processing.
+   * - [SoftMax \| LayerNorm \| GroupNorm] + [Unary \| Binary] `^{0-3}` + Quantize `_{>out}`
+     - This pattern is used in SmoothQuant to fuse scales and quantization into previous layers.
+
+Training
+~~~~~~~~
+
+.. list-table::
+   :widths: 75 25
+   :header-rows: 1
+
+   * - Pattern
+     - Description
+   * - ConvolutionBackwardWeights + BiasAddBackward `_{>out}`
+     - N/A
+   * - ReLUBackward + BatchNormTrainingBackward `_{>out}`
+     - N/A
diff --git a/doc/graph/sdpa.md b/doc/graph/sdpa.md
deleted file mode 100644
index 1b0864a5c76..00000000000
--- a/doc/graph/sdpa.md
+++ /dev/null
@@ -1,128 +0,0 @@
-Scaled Dot-Product Attention (SDPA) {#dev_guide_graph_sdpa}
-===========================================================
-
-## Background
-
-Scaled Dot-Product Attention (SDPA) was introduced in [1] as the core operation
-of Transformer block which now becomes the backbone of many language models and
-generative models (BERT, Stable Diffusion, GPT, etc.).
-
-The input of SDPA consists of query (Q), key (K), and value (V). The attention
-output is computed as:
-
-\f[
-
-    attention(Q,K,V) = V \cdot softmax(\frac{QK^T}{\sqrt{d_k}})
-
-\f]
-
-\f$d_k\f$ is the dimension size of K. Other notations used in the document:
-
-- N: the mini-batch size.
-- H: the number of multi-head.
-- S: the sequence length.
-- D: the size of each head.
-
-## SDPA patterns
-
-oneDNN supports SDPA and its optimization through Graph API [2] by defining the
-SDPA graph, getting partition from the graph, and optimizing the kernels
-underneath. In general, an SDPA pattern is defined as a directional acyclic
-graph (DAG) using oneDNN Graph API.
-
-### Floating point SDPA
-
-oneDNN defines floating point (f32, bf16, or f16) SDPA as follows. The blue
-nodes are required when defining an SDPA pattern while the brown parts are
-optional.
-
-![SDPA pattern](images/sdpa.png)
-
-1. The first MatMul calculates the dot products between Query and Key. See
-   [MatMul](@ref dev_guide_op_matmul) operation in Graph API.
-2. The Scale node is optional and is used to scale the output of the first
-   MatMul with a scaling factor. It can be constructed by [Multiply](@ref dev_guide_op_multiply)
-   or [Divide](@ref dev_guide_op_divide) operation in Graph API. The scaling
-   factor is given by users as an input of SDPA. \f$\sqrt{d_k}\f$ in the formula
-   is not considered as part of the SDPA pattern as it is constant.
-3. The Mask node is optional and is used to apply an attention mask to the
-   output of the previous Scale node. It can be constructed by [Add](@ref dev_guide_op_add)
-   or [Select](@ref dev_guide_op_select) operation in Graph API for different
-   mask policies (eg. causal mask or padding mask). When Add operation is used
-   to apply the mask, the input mask is usually an upper triangular matrix with
-   all the elements above the diagonal filled with `-inf` and zeroes elsewhere.
-   The `-inf` entries will become zero probability after Softmax is applied in
-   the next step. Alternately, a Select operation may be used. In this case, the
-   input is a boolean tensor (for example, with `true` on and below the
-   diagonal, and `false` above the diagonal). A `false` element in the mask
-   forces the corresponding element of the scaled output to `-inf`, while a
-   `true` element leaves it unchanged.
-
-   ![SDPA-mask-1](images/sdpa-mask-1.png) ![SDPA-mask-2](images/sdpa-mask-2.png)
-
-4. The SoftMax operation takes the masked output and transforms it into
-   probabilities between 0 and 1. See [SoftMax](@ref dev_guide_op_softmax)
-   operation in Graph API.
-5. The second MatMul calculates the dot products between the probabilities after
-   SoftMax and Value.
-6.
The Reorder node is optional and used to reshape or transpose the attention - output for cases where the attention output is transformed from shape (N, H, - S, D) to (N, S, H, D) or (N, S, H * D). The node can be constructed by the - combinations of [StaticTranspose](@ref dev_guide_op_statictranspose) and - [StaticReshape](@ref dev_guide_op_staticreshape) operation in Graph API. - - ![SDPA-Reorder](images/sdpa-reorder.png) - - -## Data types - -oneDNN supports the floating point SDPA pattern with data types f32, bf16, and -f16. oneDNN users can specify the data type via the input and output logical -tensors' data type fields for each operation. oneDNN does not support mixing -different floating data types in a floating point SDPA pattern. - -oneDNN supports the quantized SDPA pattern with int8-f32 mixed precision, -int8-bf16 mixed precision, and int8-f16 mixed precision data types. - -The definition of the data types and support status on different CPU and GPU -platforms follow the general description in @ref dev_guide_data_types. - -## Implementation limitations - -1. oneDNN primitive-based SDPA is implemented as the reference implementation on - both Intel Architecture Processors and Intel Graphics Products. In this case, - floating point SDPA patterns are usually implemented with f32/bf16/f16 matmul - (with post-ops) and softmax primitives, while quantized SDPA patterns are - implemented with int8 matmul (with post-ops) and f32/bf16/f16 softmax - primitives. -2. The SDPA patterns functionally supports all input shapes meeting the shape - requirements of each operation in the graph. For example, Add, Multiply, - Divide, and Select operations require the input tensors to have the same - shape or the shapes can be properly broadcasted based on the operation - attribute. -3. CPU - - Optimized implementation is available for 4D Q/K/V tensors with shape - defined as (N, H, S, D). - - Optimized implementation is available for OpenMP runtime and Threadpool - runtime on Intel Architecture Processors. - - Specifically for OpenMP runtime, the optimized implementation requires `N * - H > 2 * thread number` to get enough parallelism. -4. GPU - - Optimized implementation is available for 4D Q/K/V tensors with shape - defined as (N, H, S, D). - - Optimized implementation is available for floating point SDPA with `f16` - data type and `D <= 256` on Intel Graphics Products with Intel(R) Xe Matrix - Extensions (Intel(R) XMX) support. - -## Example - -oneDNN provides an [SDPA -example](https://github.com/oneapi-src/oneDNN/tree/main/examples/graph/sdpa.cpp) -demonstrating how to construct a typical floating point SDPA pattern with oneDNN -Graph API on CPU and GPU with different runtimes. - -## References - -[1] Attention is all you need, https://arxiv.org/abs/1706.03762v7 - -[2] oneDNN Graph API documentation, https://oneapi-src.github.io/oneDNN/graph_extension.html diff --git a/doc/graph/supported_patterns.md b/doc/graph/supported_patterns.md deleted file mode 100644 index 6118a088929..00000000000 --- a/doc/graph/supported_patterns.md +++ /dev/null @@ -1,159 +0,0 @@ -Supported Fusion Patterns {#dev_guide_graph_fusion_patterns} -============================================================ - -@anchor fusion_patterns -## Fusion Patterns - -The following fusion patterns are subgraphs that the oneDNN Graph API recognizes -as candidate for fusion. The patterns are described using oneDNN Graph -operation (op) names with the following convention. 
- -@note oneDNN Graph performs limited input validation to minimize the performance -overheads. The application is responsible for sanitizing inputs passed to the -library. For large u8 or s8 inputs may lead to accumulator overflow, you can use -floating point patterns instead of quantized patterns. - -`"+"` describes a chain of two ops. The preceding op produces an output tensor, -which is consumed by the following op as its first operand. - -`"[]"` describes a component of the overall pattern description. For example, -it could include a subgraph or all the op choices within the bracket. - -`"|"` describes choices of multiple operations, say A+[B|C] means the graph -partition contains A followed by B or C. - -`","` describes a graph composed of multiple subgraphs, each subgraph marks its -output tensor explicitly, which is consumed by other subgraphs. - -`Superscript` denotes the numbers of repetition pattern. For example, -A+[B|C]\f$^{3}\f$ means the graph partition contains A followed by three ops, -each of them is either B or C. The superscript could be a range of number -meaning allowing a range of repetition. If the range is between 0 and 1, we use -superscript `"?"`. - -`Subscript` denotes the input and output tensors which need to explicitly mark -the producer and consumer relation within one graph partition. For example, -A\f$_{>t1}\f$+B+C\f$_{"` refers to the output -tensor, and `"<"` for input tensor. Input and output tensor between neighbor -ops are not explicitly marked, for example, B consumes t1 implicitly in the -example above. - -Subscript `"out"` marks the output tensor of a certain op to be the output of -a graph partition. For example, in -A\f$_{>t1}\f$+B\f$_{>out}\f$+C\f$_{out}\f$, B's output and C's output -are marked as output tensors. - -Subscript `"in"` marks the input tensor of a certain op to be the input of a -graph partition. For example, in A\f$_{t1}\f$+B+C\f$_{out}\f$ | This pattern is widely used in Convolution Neural Networks, for example ResNet, ResNext, SSD, etc. | -| ConvTranspose + BiasAdd\f$^?\f$ + [Unary \| Binary]\f$^{0-3}\f$\f$_{>out}\f$ | This pattern is widely used in Generative Adversarial Networks. | -| Interpolate + [Unary \| Binary]\f$^{0-3}\f$\f$_{>out}\f$ | This pattern is widely used for image processing. | -| MatMul + BiasAdd\f$^?\f$ + [Unary \| Binary]\f$^{0-3}\f$ + Select\f$^?\f$\f$_{>out}\f$ | This pattern is widely used in language models and recommendation models, for example BERT, DLRM, etc. | -| Reduction + [Unary \| Binary]\f$^{0-3}\f$\f$_{>out}\f$ | This pattern is widely used for data processing, for example loss reduction. | -| Unary + Binary\f$^{0-3}\f$\f$_{>out}\f$ | This pattern is widely used in Convolution Neural Networks. | -| Binary + [Unary \| Binary]\f$^{0-3}\f$\f$_{>out}\f$ | This pattern is widely used in Generative Adversarial Networks, for example ParallelWaveGAN. | -| [AvgPool \| MaxPool] + Binary\f$^{0-3}\f$\f$_{>out}\f$ | This pattern is widely used in Convolution Neural Networks. | -| BatchNormInference + ReLU\f$_{>out}\f$ | This pattern is widely used in Convolution Neural Networks, for example DenseNet. | -| Reciprocal + Multiply\f$_{>out}\f$ | N/A | -| Reorder + Add\f$_{>out}\f$ | N/A | -| Scaled Dot-Product Attention | Refer to @ref dev_guide_graph_sdpa for more details. 
|
-
-#### Quantized Patterns
-
-| Pattern | Description |
-|:--------|:-----------------------------|
-| Quantize\f$^?\f$ + Dequantize\f$_{>t1}\f$, Dequantize\f$_{>t2}\f$\f$^{0-3}\f$, Dequantize + Convolution\f$_{<t1}\f$ + BiasAdd\f$^?\f$ + [Unary \| Binary\f$_{<t2}\f$]\f$^{0-3}\f$ + Quantize\f$_{>out}\f$ | N/A |
-| Quantize\f$^?\f$ + Dequantize\f$_{>t1}\f$, Dequantize\f$_{>t2}\f$\f$^{0-3}\f$, Dequantize + ConvTranspose\f$_{<t1}\f$ + BiasAdd\f$^?\f$ + [Unary \| Binary\f$_{<t2}\f$]\f$^{0-3}\f$ + Quantize\f$_{>out}\f$ |N/A |
-| Quantize\f$^?\f$ + Dequantize\f$_{>t1}\f$, Dequantize\f$_{>t2}\f$\f$^{0-3}\f$, Dequantize + MatMul\f$_{<t1}\f$ + BiasAdd\f$^?\f$ + [Unary \| Binary\f$_{<t2}\f$]\f$^{0-3}\f$ + Quantize\f$_{>out}\f$ |N/A |
-| Dequantize + [AvgPool \| MaxPool] + Quantize\f$_{>out}\f$ |N/A |
-| Dequantize\f$_{>t1}\f$, Dequantize + [AvgPool \| MaxPool] + Add\f$_{<t1}\f$ + Quantize\f$_{>out}\f$ |N/A |
-| Dequantize + Reorder + Quantize\f$_{>out}\f$ |N/A |
-| Dequantize\f$_{>t1}\f$, Dequantize + Reorder + Add\f$_{<t1}\f$ + Quantize\f$_{>out}\f$ |N/A |
-| [SoftMax \| LayerNorm \| GroupNorm] + [Unary \| Binary]\f$^{0-3}\f$ + Quantize\f$_{>out}\f$ | This pattern is used in SmoothQuant to fuse scales and quantization into previous layers |
-
-### Training
-
-| Pattern | Description |
-|:--------|:-----------------------------|
-| ConvolutionBackwardWeights + BiasAddBackward\f$_{>out}\f$ | N/A |
-| ReLUBackward + BatchNormTrainingBackward\f$_{>out}\f$ |N/A |
-
-All the above fusion patterns are supported by default.
-
-## Aggressive Fusion Patterns
-Aggressive fusion patterns also follow the pattern description convention
-defined in the [Fusion Patterns](@ref fusion_patterns) section.
-
-@note Aggressive fusion patterns are only supported when
-[Graph Compiler](@ref dev_guide_graph_compiler) is enabled.
-
-The following categories will also be used to describe aggressive fusion
-patterns.
-
-- ReshapeTranspose = [StaticReshape + StaticTranspose\f$^{1-2}\f$]
-
-- Activation = [ReLU \| Sigmoid \| GELU]
-
-- ActivationBackward = [ReLUBackward \| SigmoidBackward \| GELUBackward]
-
-### Inference
-
-#### Floating Point Patterns
-
-| Pattern | Description |
-|:--------|:-----------------------------|
-| MatMul + [Multiply \| Divide] + Add + Softmax + MatMul + StaticTranspose + Reorder\f$_{>out}\f$ | Multi-head Attention. This pattern is widely used in models containing encoder-decoder structures, for example BERT. |
-| ReshapeTranspose\f$_{>t1}\f$, ReshapeTranspose\f$_{>t2}\f$, ReshapeTranspose + MatMul\f$_{<t1}\f$ + [Multiply \| Divide] + Add + Softmax + MatMul\f$_{<t2}\f$ + StaticTranspose + Reorder\f$_{>out}\f$ | Multi-head Attention. |
-| MatMul + Activation\f$_{>t1}\f$, [MatMul\f$_{<t1}\f$ + Activation\f$_{>t1}\f$]\f$^{0-4}\f$, MatMul\f$_{<t1}\f$ + Activation\f$_{>out}\f$ | Multi-layer Perceptron. This pattern is widely used in recommendation models, for example DLRM. |
-| [Convolution + BiasAdd\f$^{?}\f$ + ReLU]\f$^{1-3}\f$ + Convolution + BiasAdd\f$^{?}\f$ + Add + ReLU\f$_{>out}\f$ | Identical Bottleneck. Enabled only in single thread runtime scenario. This pattern is widely used in Convolution Neural Networks, for example ResNet. |
-| Convolution + BiasAdd\f$^{?}\f$\f$_{>t1}\f$, [Convolution + BiasAdd\f$^{?}\f$ + ReLU]\f$^{1-3}\f$ + Convolution + BiasAdd\f$^{?}\f$ + Add\f$_{<t1}\f$ + ReLU\f$_{>out}\f$ | Convolutional Bottleneck. Enabled only in single thread runtime scenario. This pattern is widely used in Convolution Neural Networks, for example ResNet. |
-
-#### Quantized Patterns
-
-| Pattern | Description |
-|:--------|:-----------------------------|
-| Dequantize\f$_{>t1}\f$, Dequantize\f$_{>t2}\f$, Dequantize + MatMul\f$_{<t1}\f$ + [Multiply \| Divide] + Add + Softmax + Quantize + Dequantize + MatMul\f$_{<t2}\f$ + StaticTranspose + Reorder + Quantize\f$_{>out}\f$ | Quantized Multi-head Attention. |
-| Dequantize + ReshapeTranspose\f$_{>t1}\f$, Dequantize + ReshapeTranspose\f$_{>t2}\f$, Dequantize + MatMul\f$_{<t1}\f$ + [Multiply \| Divide] + Add + Softmax + Quantize + Dequantize + MatMul\f$_{<t2}\f$ + StaticTranspose + Reorder + Quantize\f$_{>out}\f$ | Quantized Multi-head Attention.
| -| Dequantize\f$_{>t1}\f$, Dequantize + MatMul\f$_{t2}\f$, [Dequantize\f$_{>t3}\f$, Dequantize\f$_{t2}\f$]\f$^{0-4}\f$, Dequantize\f$_{>t4}\f$, Dequantize\f$_{out}\f$ | Quantized Multi-layer Perceptron. | -| Dequantize\f$_{>t2}\f$, Dequantize\f$_{>t3}\f$, [Dequantize\f$_{>t1}\f$, Dequantize + Convolution\f$_{out}\f$ | Quantized Identical Bottleneck. Enabled only in single thread runtime scenario. | -| [Dequantize\f$_{>t1}\f$, Dequantize + Convolution\f$_{t2}\f$, Dequantize\f$_{>t4}\f$, [Dequantize\f$_{>t3}\f$, Dequantize + Convolution\f$_{out}\f$ | Quantized Convolutional Bottleneck. Enabled only in single thread runtime scenario. | - -### Training - -| Pattern | Description | -|:--------|:-----------------------------| -| Dequantize\f$_{>t1}\f$, Dequantize\f$_{>t2}\f$, Dequantize + MatMul\f$_{out}\f$ | Multi-head Attention Training Forward Pattern. | -| StaticReshape + StaticTranspose\f$_{>t1}\f$ + MatMul + Multiply\f$_{>t2}\f$ + Subtract\f$_{t4}\f$ + MatMul\f$_{>out1}\f$, Multiply\f$_{t3}\f$, MatMul\f$_{out2}\f$, MatMul\f$_{out3}\f$ | Multi-head Attention Training Backward Pattern. | -| MatMul\f$_{>out1}\f$ + Activation\f$_{>t1,>out2}\f$, [MatMul\f$_{out3}\f$ + Activation\f$_{>t1,>out4}\f$]\f$^{0-4}\f$, MatMul\f$_{out5}\f$ + Activation\f$_{>out6}\f$ | Multi-layer Perceptron Training Forward Pattern. | -| StaticTranspose\f$^{?}\f$\f$_{>t0}\f$, ActivationBackward\f$_{>t2}\f$ + MatMul\f$_{t1}\f$, ReduceSum\f$^{?}\f$\f$_{out1}\f$, StaticTranspose\f$^{?}\f$ + MatMul\f$_{out2}\f$, [StaticTranspose\f$^{?}\f$\f$_{>t3}\f$, ActivationBackward\f$_{>t4,t1}\f$, ReduceSum\f$^{?}\f$\f$_{out3}\f$, StaticTranspose\f$^{?}\f$ + MatMul\f$_{out4}\f$]\f$^{0-4}\f$, StaticTranspose\f$^{?}\f$\f$_{>t5}\f$, ActivationBackward\f$_{>t6,out5}\f$, ReduceSum\f$^{?}\f$\f$_{out6}\f$, StaticTranspose\f$^{?}\f$ + MatMul\f$_{out7}\f$ | Multi-layer Perceptron Training Backward Pattern. | -| Convolution\f$_{>out1}\f$ + BatchNormForwardTraining\f$_{>out2}\f$ + ReLU\f$_{>out3}\f$ + Convolution\f$_{>out4}\f$ + BatchNormForwardTraining\f$_{>out5}\f$ + ReLU\f$_{>out6}\f$ + Convolution\f$_{>out7}\f$ + BatchNormForwardTraining\f$_{>out8}\f$ + Add + ReLU\f$_{>out9}\f$ | Identical Bottleneck Training Forward Pattern. | -| Convolution\f$_{>out1}\f$ + BatchNormForwardTraining\f$_{>t1,>out2}\f$, Convolution\f$_{>out3}\f$ + BatchNormForwardTraining\f$_{>out4}\f$ + ReLU\f$_{>out5}\f$ + Convolution\f$_{>out6}\f$ + BatchNormForwardTraining\f$_{>out7}\f$ + ReLU\f$_{>out8}\f$ + Convolution\f$_{>out9}\f$ + BatchNormForwardTraining\f$_{>out10}\f$ + Add\f$_{out11}\f$ | Convolutional Bottleneck Training Forward Pattern. | -| ReLUBackward\f$_{>t1}\f$ + BatchNormTrainingBackward\f$_{>t2,>out1}\f$ + ConvolutionBackwardData + ReLUBackward + BatchNormTrainingBackward\f$_{>t3,>out2}\f$ + ConvolutionBackwardData + ReLUBackward + BatchNormTrainingBackward\f$_{>t4,>out3}\f$ + ConvolutionBackwardData + Add\f$_{out4}\f$, ConvolutionBackwardWeights\f$_{out5}\f$, ConvolutionBackwardWeights\f$_{out6}\f$, ConvolutionBackwardWeights\f$_{out7}\f$ | Identical Bottleneck Training Backward Pattern. 
| -| ReLUBackward\f$_{>t1}\f$ + BatchNormTrainingBackward\f$_{>t2,>out1}\f$ + ConvolutionBackwardData + ReLUBackward + BatchNormTrainingBackward\f$_{>t3,>out2}\f$ + ConvolutionBackwardData + ReLUBackward + BatchNormTrainingBackward\f$_{>t4,>out3}\f$ + ConvolutionBackwardData + Add\f$_{out4}\f$, BatchNormTrainingBackward\f$_{t5,>out5}\f$ + ConvolutionBackwardData\f$_{>t6}\f$, ConvolutionBackwardWeights\f$_{out6}\f$, ConvolutionBackwardWeights\f$_{out7}\f$, ConvolutionBackwardWeights\f$_{out8}\f$, ConvolutionBackwardWeights\f$_{out9}\f$ | Convolutional Bottleneck Training Backward Pattern. | diff --git a/doc/performance_considerations/benchdnn.md b/doc/performance_considerations/benchdnn.md index 36d8124d059..56fc9628e1c 100644 --- a/doc/performance_considerations/benchdnn.md +++ b/doc/performance_considerations/benchdnn.md @@ -4,4 +4,4 @@ Benchmarking Performance {#dev_guide_benchdnn} oneDNN has a built-in benchmarking program called benchdnn. For a complete description of the available options and working examples, see -the [benchdnn readme](https://github.com/oneapi-src/oneDNN/blob/master/tests/benchdnn/README.md#benchdnn). +the [benchdnn readme](https://github.com/uxlfoundation/oneDNN/blob/main/tests/benchdnn/README.md#benchdnn). diff --git a/doc/performance_considerations/verbose.md b/doc/performance_considerations/verbose.md index ca1f02e95e5..9696125237b 100644 --- a/doc/performance_considerations/verbose.md +++ b/doc/performance_considerations/verbose.md @@ -27,6 +27,7 @@ the type of tracing information to display. |:---------------------------|:--------------------|:--------------------------------------------------| | `ONEDNN_VERBOSE` | `none` | no messages printed | | \ | **`error`** | **error messages** (default) | +| \ | `warn` | warning messages | | \ | `check` | primitive creation parameter checking information | | \ | `profile_create` | primitive creation timings | | \ | `profile_exec` | primitive execution timings | @@ -150,7 +151,7 @@ Above, we can see that the highest performance implementations were not dispatched either because they required a higher ISA, or because they did not support that datatype configuration. A complete list of verbose messages encountered in the dispatch mode -can be found [here](https://oneapi-src.github.io/oneDNN/dev_guide_verbose_table.html) along with their explanation. +can be found [here](https://uxlfoundation.github.io/oneDNN/dev_guide_verbose_table.html) along with their explanation. ### Enable ONEDNN_VERBOSE with timestamps @@ -195,6 +196,7 @@ Each subsequent line of primitive verbose information is formatted as a comma-separated list and contains the following, in order of appearance in the line from left to right: * `onednn_verbose` marker string +* verbose mode version: `v0` or `v1` * if `ONEDNN_VERBOSE_TIMESTAMP=1` is specified, start time of the call. On Linux this number represents amount of milliseconds since Unix epoch. On Windows this number represents amount of milliseconds since the last system start. @@ -238,7 +240,7 @@ primitive execution. @note When oneDNN verbose mode is enabled for builds with -[Compute Library for the Arm architecture](https://oneapi-src.github.io/oneDNN/dev_guide_build.html#gcc-with-arm-compute-library-acl-on-aarch64-host), +[Compute Library for the Arm architecture](https://uxlfoundation.github.io/oneDNN/dev_guide_build.html#gcc-with-arm-compute-library-acl-on-aarch64-host), any failures in the validation of Compute Library primitives will be detailed in the verbose output. 
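For a concrete starting point, the following minimal sketch shows verbose output
being enabled from application code rather than through the environment. It
assumes the `dnnl::set_verbose()` helper declared in `dnnl.hpp` (levels mirror
the classic `ONEDNN_VERBOSE` settings: `0` for none, `1` for execution
profiling, `2` for creation plus execution profiling); setting the
`ONEDNN_VERBOSE` environment variable before launching the application remains
the non-intrusive alternative.

~~~cpp
// Minimal sketch, assuming dnnl::set_verbose() is available in this build.
#include "dnnl.hpp"

int main() {
    dnnl::set_verbose(2); // 0: none, 1: exec profiling, 2: create + exec

    dnnl::engine eng(dnnl::engine::kind::cpu, 0);
    dnnl::stream strm(eng);

    // A tiny eltwise primitive is enough to trigger onednn_verbose lines in
    // the format described above.
    dnnl::memory::desc md({16}, dnnl::memory::data_type::f32,
            dnnl::memory::format_tag::a);
    auto pd = dnnl::eltwise_forward::primitive_desc(eng,
            dnnl::prop_kind::forward_inference,
            dnnl::algorithm::eltwise_relu, md, md, 0.f, 0.f);
    dnnl::memory mem(md, eng);
    dnnl::eltwise_forward(pd).execute(
            strm, {{DNNL_ARG_SRC, mem}, {DNNL_ARG_DST, mem}});
    strm.wait();
    return 0;
}
~~~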
diff --git a/doc/performance_considerations/verbose_table.md b/doc/performance_considerations/verbose_table.md
index 34d678aca08..ace5d59afd1 100644
--- a/doc/performance_considerations/verbose_table.md
+++ b/doc/performance_considerations/verbose_table.md
@@ -33,6 +33,7 @@ The following catalogue lists verbose messages, explanations, and additional inf
 |`<t> has a bad number of dimensions <ndims>` |`t`- tensor, `ndims`- number of tensor dimensions | all | Tensor data has bad or invalid number of dimensions for the current primitive operation. **Example**: The `convolution` primitive expects only 1D-, 2D- or 3D-spatial tensors for operations and prints this message for any other data with higher dimensions. |
 |`bad dimensions <t>:<axis>` |`t`- tensor, `axis`- axis | all | Tensor `<t>` has an invalid dimension along the specified axis. **Example**: The `concat` primitive prints this message when the destination tensor dimension along the concatenated axis does not match the sum of the dimensions of the concatenated tensors. |
 |`dimension <t0>:<a0> is inconsistent with <t1>:<a1>` |`t0, t1` - tensors, `a0, a1` - tensor axes | all | Tensors `t0, t1` have inconsistent dimensions along axes `a0` and `a1` respectively. **Example**: This is encountered for the `matmul` primitive when the input matrices have mismatching dimensions. |
+|`out-of-range dimensions for <t>` |`t` - tensor | all | One of the dimensions of tensor `t` is beyond the maximum range that can be processed by the current implementation. |
 |`tensors <t0> and <t1> have inconsistent number of dimensions` |`t0, t1` - tensors | all | Tensors `t0, t1` have inconsistent dimensions for primitive operation. |
 |`tensors <t0> and <t1> have inconsistent datatypes` |`t0, t1` - tensors | all | Tensors `t0, t1` have inconsistent data types for primitive operation. |
 |**Unsupported Combinations** | | | |
@@ -53,22 +54,25 @@ The following catalogue lists verbose messages, explanations, and additional inf
 |`alpha and beta parameters are not properly set` | | `eltwise` | Alpha and beta parameters are not properly set for the elementwise algorithm. |
 |`large shapes fall back` | | `gemm` | Heuristic to skip current implementation for large tensor shapes for better performance. |
 |`only trivial strides are supported` | | `gemm`, `rnn` | Current implementation for the primitive does not process non-trivial stride values. |
-|`unsupported fpmath mode` | | `matmul` | [Floating-point math mode](https://oneapi-src.github.io/oneDNN/group_dnnl_api_fpmath_mode.html?highlight=math%20mode) is not supported by the current primitive implementation. |
+|`unsupported fpmath mode` | | `matmul` | [Floating-point math mode](https://uxlfoundation.github.io/oneDNN/group_dnnl_api_fpmath_mode.html?highlight=math%20mode) is not supported by the current primitive implementation. |
 |`small shapes fall back` | | `matmul` | Heuristic to skip current implementation for small tensor shapes for better performance. |
 |`incompatible gemm format` | | `matmul`, `ip` | Specified GeMM format is incompatible with the current primitive implementation. |
 |`unsupported <t> tensor layout` |`t` - tensor | `reorder` | The data layout for the source/destination tensor is not supported by the current implementation. |
 |`bad axis` | | `softmax`, `shuffle` | Bad or invalid axis specified for softmax/shuffle operation. |
 |`unsupported <d> architecture` | `d` - `dnnl::engine::kind` | `gemm` | Unsupported architecture for specified device-type. Typically encountered when current GPU device does not support the primitive. |
 |**Miscellaneous** | | | |
-|`failed to create nested primitive <pm>` |`pm` - `dnnl::primitive` | all | Descriptor initialization for the nested primitive implementation was unsuccessful. |
+|`failed to create nested primitive` |`pm` - `dnnl::primitive` | all | Descriptor initialization for the nested primitive implementation was unsuccessful. |
 |`failed to create descriptor` |`pm` -`dnnl::primitive`, `dnnl::memory` | all | Descriptor initialization for the primitive or memory object was unsuccessful. |
-|`bad accumulation mode` | | all | Bad or invalid [accumulation mode](https://oneapi-src.github.io/oneDNN/enum_dnnl_accumulation_mode.html) specified for primitive attribute `dnnl::primitive_attr`. |
+|`bad accumulation mode` | | all | Bad or invalid [accumulation mode](https://uxlfoundation.github.io/oneDNN/enum_dnnl_accumulation_mode.html) specified for primitive attribute `dnnl::primitive_attr`. |
 |`unsupported md flag` |`t` - tensor | all | Bad or unsupported flags specified for the memory descriptor `dnnl::memory::desc`. |
 |`problem is not mathematically consistent` | | all | *(self-explanatory)* |
 |`workspace mismatch between forward and backward primitive descriptors`| | all | *(self-explanatory)* |
-|`workspace initialization failed` | | all | [Workspace](https://oneapi-src.github.io/oneDNN/dev_guide_inference_and_training_aspects.html?highlight=workspace#workspace) descriptor initialization was unsuccessful during primitive creation. |
+|`workspace initialization failed` | | all | [Workspace](https://uxlfoundation.github.io/oneDNN/dev_guide_inference_and_training_aspects.html?highlight=workspace#workspace) descriptor initialization was unsuccessful during primitive creation. |
 |`invalid datatype for <t>` |`t` - tensor | all | The data type for the tensor/data processed by the primitive is invalid. **Example**: This is encountered when an undefined data type `data_type::undef` is specified for the accumulator. |
-|`failed to run kernel deterministically` | | all | failed to run application in the [deterministic mode](https://oneapi-src.github.io/oneDNN/dev_guide_attributes_deterministic.html?highlight=deterministic). |
+|`failed to run kernel deterministically` | | all | failed to run application in the [deterministic mode](https://uxlfoundation.github.io/oneDNN/dev_guide_attributes_deterministic.html?highlight=deterministic). |
+|`skipping or dispatching to another implementation` | | all | *(self-explanatory)* |
+|`failed to create kernel <k>` |`k` - kernel name | all | *(self-explanatory)* |
+

 ## Engine Creation

@@ -76,13 +80,13 @@ The following catalogue lists verbose messages, explanations, and additional inf
 |:-----------------------------------------------------|:----------|:------------|:--------------------------------------------------------------------------------------------------------------------------------------------------------|
 |`bad engine kind` | | all | Invalid value for `dnnl::engine::kind` encountered during engine creation. |
 |`invalid <d> device in environment: index <i>` |`d` - `dnnl::engine::kind`, `i` - device index | all | Device of type `dnnl::engine::kind` and index `i` is invalid for the current environment. |
-|`no <d> device is available` |`d` - `dnnl::engine::kind` | all | No device of type `dnnl::engine::kind` was found during engine creation. |
-|`<n> <d> devices are available but <i> was queried` |`d` - `dnnl::engine::kind`, `n` - number of `d` devices, `i` - queried device index | all | Queried index is out-of-range for device of type `dnnl::engine::kind`.
 |
+|`no <d> device is available` |`d` - `dnnl::engine::kind`, `k` - `dnnl::impl::runtime_kind` | all | No device of type `dnnl::engine::kind` was found during engine creation. |
+|`<n> <d> devices are available but <i> device index was queried` |`d` - `dnnl::engine::kind`, `n` - number of `d` devices, `i` - queried device index | all | Queried index is out-of-range for device of type `dnnl::engine::kind`. |
 |`device not found in the given context` | | all | *(self-explanatory)* |
 |`unsupported <d> platform (expected <d0> got <d1>)` |`d` - `dnnl::engine::kind`, `d0` - queried platform, `d1` - available platform | `sycl`, `opencl` | Unsupported device platform encountered during engine creation. |
-|`failed to create <d> engine with index <i>` |`d` - `dnnl::engine::kind`, `i` - device index |all | Engine creation was unsuccessful for specified device index and kind. |
+|`failed to create <d> engine with index <i>` |`d` - `dnnl::engine::kind`, `i` - device index |all | Engine creation was unsuccessful for the specified device index and kind. |
 |`unsupported backend` |`d` - `dnnl::engine::kind` | `sycl` | *(self-explanatory)* |
-|`profiling capabilities are not supported` | | all | Experimental profiling ([ONEDNN_EXPERIMENTAL_PROFILING](https://oneapi-src.github.io/oneDNN/dev_guide_experimental.html?highlight=profiling#onednn-experimental-profiling)) is not enabled for the application. |
+|`profiling capabilities are not supported` | | all | Experimental profiling ([ONEDNN_EXPERIMENTAL_PROFILING](https://uxlfoundation.github.io/oneDNN/dev_guide_experimental.html?highlight=profiling#onednn-experimental-profiling)) is not enabled for the application. |

 ## Memory Creation and Related Operations

@@ -92,6 +96,6 @@ The following catalogue lists verbose messages, explanations, and additional inf
 |`bad arguments for memory descriptor` | Bad or unsupported values passed to the memory descriptor `dnnl::memory::desc` during memory object creation. |
 |`invalid memory index` | An out-of-range value encountered for memory handle during data mapping. |
 |`unsupported memory stride` | Memory descriptor initialization failed due to unsupported value for memory strides. |
-|`scratchpad memory limit exceeded` | [Scratchpad](https://oneapi-src.github.io/oneDNN/dev_guide_attributes_scratchpad.html?highlight=scratchpad) space is exhausted during GEMM kernel initialization. |
+|`scratchpad memory limit exceeded` | [Scratchpad](https://uxlfoundation.github.io/oneDNN/dev_guide_attributes_scratchpad.html?highlight=scratchpad) space is exhausted during GEMM kernel initialization. |
 |`scratchpad initialization unsuccessful` | *(self-explanatory)* |

diff --git a/doc/primitives/batch_normalization.md b/doc/primitives/batch_normalization.md
index a85f6ac442a..6fddd954fab 100644
--- a/doc/primitives/batch_normalization.md
+++ b/doc/primitives/batch_normalization.md
@@ -103,8 +103,8 @@ requires different inputs and outputs. For clarity, a summary is shown below.
 | #dnnl_normalization_flags_none | *Inputs*: \src <br><br> *Outputs*: \dst | *Inputs*: \src <br><br> *Outputs*: \dst, \f$\mu\f$, \f$\sigma^2\f$ | *Inputs*: \diffdst, \src, \f$\mu\f$, \f$\sigma^2\f$ <br><br> *Outputs*: \diffsrc | Same as for #dnnl_backward |
 | #dnnl_use_global_stats | *Inputs*: \src, \f$\mu\f$, \f$\sigma^2\f$ <br><br> *Outputs*: \dst | *Inputs*: \src, \f$\mu\f$, \f$\sigma^2\f$ <br><br> *Outputs*: \dst | *Inputs*: \diffdst, \src, \f$\mu\f$, \f$\sigma^2\f$ <br><br> *Outputs*: \diffsrc | Same as for #dnnl_backward |
 | #dnnl_use_scale | *Inputs*: \src, \f$\gamma\f$ <br><br> *Outputs*: \dst | *Inputs*: \src, \f$\gamma\f$ <br><br> *Outputs*: \dst, \f$\mu\f$, \f$\sigma^2\f$ | *Inputs*: \diffdst, \src, \f$\mu\f$, \f$\sigma^2\f$, \f$\gamma\f$ <br><br> *Outputs*: \diffsrc, \f$\diffgamma\f$ | Not supported |
-| #dnnl_use_shift | *Inputs*: \src, \f$\beta\f$ <br><br> *Outputs*: \dst | *Inputs*: \src, \f$\beta\f$ <br><br> *Outputs*: \dst, \f$\mu\f$, \f$\sigma^2\f$ | *Inputs*: \diffdst, \src, \f$\mu\f$, \f$\sigma^2\f$, \f$\beta\f$ <br><br> *Outputs*: \diffsrc, \f$\diffbeta\f$ | Not supported |
-| #dnnl_use_global_stats \| #dnnl_use_scale \| #dnnl_use_shift | *Inputs*: \src, \f$\mu\f$, \f$\sigma^2\f$, \f$\gamma\f$, \f$\beta\f$ <br><br> *Outputs*: \dst | *Inputs*: \src, \f$\mu\f$, \f$\sigma^2\f$, \f$\gamma\f$, \f$\beta\f$ <br><br> *Outputs*: \dst | *Inputs*: \diffdst, \src, \f$\mu\f$, \f$\sigma^2\f$, \f$\gamma\f$, \f$\beta\f$ <br><br> *Outputs*: \diffsrc, \f$\diffgamma\f$, \f$\diffbeta\f$ | Not supported |
+| #dnnl_use_shift | *Inputs*: \src, \f$\beta\f$ <br><br> *Outputs*: \dst | *Inputs*: \src, \f$\beta\f$ <br><br> *Outputs*: \dst, \f$\mu\f$, \f$\sigma^2\f$ | *Inputs*: \diffdst, \src, \f$\mu\f$, \f$\sigma^2\f$ <br><br> *Outputs*: \diffsrc, \f$\diffbeta\f$ | Not supported |
+| #dnnl_use_global_stats \| #dnnl_use_scale \| #dnnl_use_shift | *Inputs*: \src, \f$\mu\f$, \f$\sigma^2\f$, \f$\gamma\f$, \f$\beta\f$ <br><br> *Outputs*: \dst | *Inputs*: \src, \f$\mu\f$, \f$\sigma^2\f$, \f$\gamma\f$, \f$\beta\f$ <br><br> *Outputs*: \dst | *Inputs*: \diffdst, \src, \f$\mu\f$, \f$\sigma^2\f$, \f$\gamma\f$ <br><br> *Outputs*: \diffsrc, \f$\diffgamma\f$, \f$\diffbeta\f$ | Not supported |
 | `flags` \| #dnnl_fuse_norm_relu | *Inputs*: same as with `flags` <br><br> *Outputs*: same as with `flags` | *Inputs*: same as with `flags` <br><br> *Outputs*: same as with `flags`, [Workspace](@ref dev_guide_inference_and_training_aspects_workspace) | *Inputs*: same as with `flags`, [Workspace](@ref dev_guide_inference_and_training_aspects_workspace) <br><br> *Outputs*: same as with `flags` | Same as for #dnnl_backward if `flags` do not contain #dnnl_use_scale or #dnnl_use_shift; not supported otherwise |
 | `flags` \| #dnnl_fuse_norm_add_relu | *Inputs*: same as with `flags` and \f$\src_1\f$ for fused binary addition <br><br> *Outputs*: same as with `flags` | *Inputs*: same as with `flags` and \f$\src_1\f$ for fused binary addition <br><br> *Outputs*: same as with `flags`, [Workspace](@ref dev_guide_inference_and_training_aspects_workspace) | *Inputs*: same as with `flags`, [Workspace](@ref dev_guide_inference_and_training_aspects_workspace) <br><br> *Outputs*: same as with `flags` and \f$\diffsrc_1\f$ for fused binary addition | Same as for #dnnl_backward if `flags` do not contain #dnnl_use_scale or #dnnl_use_shift; not supported otherwise |

@@ -193,7 +193,7 @@ If #dnnl_use_scale or #dnnl_use_shift are used, the scale
 (\f$\gamma\f$) and shift (\f$\beta\f$) are separate 1D tensors of shape
 \f$C\f$.

-The format of the corresponding memory object must be #dnnl_nc (#dnnl_ab).
+The format of the corresponding memory object must be #dnnl_a.

 #### Source, Destination, and Their Gradients
diff --git a/doc/primitives/binary.md b/doc/primitives/binary.md
index 81b8bed6157..47f50248d31 100644
--- a/doc/primitives/binary.md
+++ b/doc/primitives/binary.md
@@ -16,9 +16,18 @@ between tensors source 0 and source 1 (the variable names follow the standard
     \src_0(\overline{x}) \mathbin{op} \src_1(\overline{x}),
 \f]

-where \f$op\f$ is one of addition, subtraction, multiplication, division,
-greater than or equal to, greater than, less than or equal to, less than,
-equal to, not equal to, get maximum value, and get minimum value.
+where \f$op\f$ is one of the following operators: addition (\f$+\f$),
+subtraction (\f$-\f$), multiplication (\f$\times\f$), division (\f$\div\f$),
+greater than or equal to (\f$\geq\f$), greater than (\f$>\f$),
+less than or equal to (\f$\leq\f$), less than (\f$<\f$), equal to (\f$=\f$),
+not equal to (\f$\neq\f$), get maximum value (\f$\max(\cdot)\f$),
+get minimum value (\f$\min(\cdot)\f$), and the conditional select operation.
+For the conditional select operation, the binary primitive uses a third input
+tensor \f$\src_2\f$ to select between the two source tensors:
+
+\f[
+    \dst[i] = \src_2[i] ? \src_0[i] : \src_1[i]
+\f]

 The binary primitive does not have a notion of forward or backward
 propagations.
@@ -31,6 +40,7 @@ argument index as specified by the following table.
 |-----------------------------|---------------------------------------------------------------------------|
 | \f$\src_0\f$ | DNNL_ARG_SRC_0 |
 | \f$\src_1\f$ | DNNL_ARG_SRC_1 |
+| \f$\src_2\f$ | DNNL_ARG_SRC_2 |
 | \dst | DNNL_ARG_DST |
 | \f$\text{binary post-op}\f$ | DNNL_ARG_ATTR_MULTIPLE_POST_OP(binary_post_op_position) \| DNNL_ARG_SRC_1 |
 | \f$binary scale0\f$ | DNNL_ARG_ATTR_SCALES \| DNNL_ARG_SRC_0 |
@@ -65,6 +75,10 @@ argument index as specified by the following table.
   be overwritten. In-place mode requires the \dst and source 0 data types to be
   the same. Different data types will unavoidably lead to correctness issues.

+ * For the binary select operation, broadcast semantics are not supported for
+   the third conditional input tensor. For this case, the dimensions and layout
+   of the conditional input tensor must match those of the source 0 tensor.
+
 ### Post-Ops and Attributes

 The following attributes are supported:
@@ -80,6 +94,8 @@ The following attributes are supported:

 The source and destination tensors may have `f32`, `bf16`, `f16`, `s32` or
 `s8/u8` data types.
+For the binary select operation, the conditional input tensor can only be
+of `s8` data type.

 The binary primitive supports the following combinations of data types:

 | Source 0 / 1 | Destination |
@@ -106,7 +122,6 @@ meaning associated with any of tensors dimensions.

 2. **GPU**
    - Only tensors of 6 or fewer dimensions are supported.
-   - s32 data type is not supported.
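To make the select semantics above concrete, here is a hedged sketch of
creating and executing the operation. It assumes the select algorithm is
exposed as `dnnl::algorithm::binary_select` and that
`dnnl::binary::primitive_desc` provides an overload accepting the conditional
tensor descriptor; verify both against the release you build with.

~~~cpp
// Hedged sketch: dst[i] = src_2[i] ? src_0[i] : src_1[i].
#include "dnnl.hpp"
using namespace dnnl;

void binary_select_sketch(engine &eng, stream &strm) {
    memory::dims dims = {2, 16, 8, 8};
    memory::desc src0_md(dims, memory::data_type::f32, memory::format_tag::nchw);
    memory::desc src1_md(dims, memory::data_type::f32, memory::format_tag::nchw);
    // The conditional tensor must be s8 and match src_0's dimensions/layout.
    memory::desc cond_md(dims, memory::data_type::s8, memory::format_tag::nchw);
    memory::desc dst_md(dims, memory::data_type::f32, memory::format_tag::nchw);

    // Assumed overload with a third source descriptor; may differ per release.
    auto pd = binary::primitive_desc(eng, algorithm::binary_select,
            src0_md, src1_md, cond_md, dst_md);

    memory src0(src0_md, eng), src1(src1_md, eng);
    memory cond(cond_md, eng), dst(dst_md, eng);
    binary(pd).execute(strm,
            {{DNNL_ARG_SRC_0, src0}, {DNNL_ARG_SRC_1, src1},
                    {DNNL_ARG_SRC_2, cond}, {DNNL_ARG_DST, dst}});
    strm.wait();
}
~~~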
## Performance Tips
diff --git a/doc/primitives/convolution.md b/doc/primitives/convolution.md
index 069c781cc03..205ba5d240e 100644
--- a/doc/primitives/convolution.md
+++ b/doc/primitives/convolution.md
@@ -100,6 +100,12 @@ Here:
 - \f$OW = \left\lfloor{\frac{IW - DKW + PW_L + PW_R}{SW}} \right\rfloor + 1,\f$
   where \f$DKW = 1 + (KW - 1) \cdot (DW + 1)\f$.

+@note In oneDNN, convolution without dilation is defined by setting the dilation
+parameters to `0`. This differs from PyTorch and TensorFlow, where a non-dilated
+case corresponds to a dilation value of `1`. As a result, the PyTorch and
+TensorFlow dilation parameters need to be adjusted by subtracting `1` (for example,
+\f$DH_{onednn} = DH_{torch} - 1\f$ and \f$DW_{onednn} = DW_{torch} - 1\f$).
+
 #### Deconvolution (Transposed Convolution)

 Deconvolutions (also called fractionally strided convolutions or transposed
@@ -160,22 +166,22 @@ N/A.

 Convolution primitive supports the following combination of data types for
 source, destination, and weights memory objects:

-| Propagation | Source | Weights | Destination | Bias |
-|:---------------|:----------|:-------------|:----------------------------|:----------------------------|
-| forward | f32 | f32 | f32, u8, s8 | f32 |
-| forward | f16 | f16 | f16, f32, u8, s8 | f16, f32 |
-| forward | u8, s8 | s8 | u8, s8, s32, f32, f16, bf16 | u8, s8, s32, f32, f16, bf16 |
-| forward | bf16 | bf16 | f32, bf16 | f32, bf16 |
-| forward | f8_e5m2 | f8_e5m2 | f8_e5m2, f32, f16, bf16 | f32 |
-| forward | f64 | f64 | f64 | f64 |
-| backward | f32, bf16 | bf16 | bf16 | |
-| backward | f32, f16 | f16 | f16 | |
-| backward | f8_e5m2 | f8_e5m2 | f8_e5m2 | |
-| backward | f32 | f32 | f32 | f32 |
-| backward | f64 | f64 | f64 | f64 |
-| weights update | bf16 | f32, bf16 | bf16, s8, u8 | f32, bf16 |
-| weights update | f16 | f32, f16 | f16 | f32, f16 |
-| weights update | f8_e5m2 | f32, f8_e5m2 | f8_e5m2 | f32 |
+| Propagation | Source | Weights | Destination | Bias |
+|:---------------|:-----------------|:----------------------|:---------------------------------|:----------------------------|
+| forward | f32 | f32 | f32, u8, s8 | f32 |
+| forward | f16 | f16 | f16, f32, u8, s8 | f16, f32 |
+| forward | u8, s8 | s8 | u8, s8, s32, f32, f16, bf16 | u8, s8, s32, f32, f16, bf16 |
+| forward | bf16 | bf16 | f32, bf16 | f32, bf16 |
+| forward | f8_e5m2, f8_e4m3 | f8_e5m2, f8_e4m3 | f8_e5m2, f8_e4m3, f32, f16, bf16 | f32 |
+| forward | f64 | f64 | f64 | f64 |
+| backward | f32, bf16 | bf16 | bf16 | |
+| backward | f32, f16 | f16 | f16 | |
+| backward | f8_e5m2, f8_e4m3 | f8_e5m2, f8_e4m3 | f8_e5m2, f8_e4m3 | |
+| backward | f32 | f32 | f32 | f32 |
+| backward | f64 | f64 | f64 | f64 |
+| weights update | bf16 | f32, bf16 | bf16, s8, u8 | f32, bf16 |
+| weights update | f16 | f32, f16 | f16 | f32, f16 |
+| weights update | f8_e5m2, f8_e4m3 | f32, f8_e5m2, f8_e4m3 | f8_e5m2, f8_e4m3 | f32 |

 @warning There might be hardware and/or implementation specific restrictions.
@@ -432,8 +438,8 @@ of Winograd algorithm implementations.

 3. **GPU**
    - Depthwise post-op is not supported
-   - Only reference support is available for f8_e4m3. Optimized implementation
-     is available for f8_e5m2 on Intel(R) Data Center GPU Max Series only.
+   - `f8` implementation uses Intel XMX cores only on Intel GPUs based on the
+     Xe-HPC, Xe2-LPG, and Xe2-HPG uArchs.

4.
**CPU** - Only reference support for fp8 data types (f8_e5m2, f8_e4m3) is diff --git a/doc/primitives/lrn.md b/doc/primitives/lrn.md index 7e4ca54a8fc..24e472af655 100644 --- a/doc/primitives/lrn.md +++ b/doc/primitives/lrn.md @@ -14,7 +14,7 @@ The LRN primitive performs a forward or backward local response normalization. The LRN operation is defined by the following formulas (the variable names follow the standard @ref dev_guide_conventions): -LRN [across channels](#dnnl_lrn_across_channels): +LRN across channels: \f[ \dst(n, c, h, w) = @@ -26,7 +26,7 @@ LRN [across channels](#dnnl_lrn_across_channels): \src(n, c, h, w), \f] -LRN [within channel](#dnnl_lrn_within_channel): +LRN within a single channel: \f[ \dst(n, c, h, w) = diff --git a/doc/primitives/matmul.md b/doc/primitives/matmul.md index a8073c68090..1a47dbc8437 100644 --- a/doc/primitives/matmul.md +++ b/doc/primitives/matmul.md @@ -67,7 +67,7 @@ argument index as specified by the following table. user must pass fully specified memory objects so that the primitive is able to perform the computations. Note that the less information about shapes or format is available at the creation stage, the less performant execution - will be. In particular, if the shape is not known at creation stage, one + will be. In particular, if the shape is not known at the creation stage, you cannot use the special format tag #dnnl::memory::format_tag::any to enable an implementation to choose the most appropriate memory format for the corresponding input or output shapes. On the other hand, run-time specified @@ -80,13 +80,13 @@ argument index as specified by the following table. invalid. 3. The broadcasting shape consistency check is not done for the dimensions with - #DNNL_RUNTIME_DIM_VAL. It is user responsibility to make sure the dimensions + #DNNL_RUNTIME_DIM_VAL. Make sure the dimensions for the tensors are valid. 4. Multiple batch dimensions and broadcasting of batch dimensions of `src` and `weights` are supported for both CPU and GPU engines. - Please check tutorials below to see #DNNL_RUNTIME_DIM_VAL support in use. + Check the tutorials below to see #DNNL_RUNTIME_DIM_VAL support in use. ### Data Types @@ -94,14 +94,17 @@ The MatMul primitive supports the following combinations of data types for source, destination, weights, and bias tensors: -| Source | Weights | Destination | Bias | -|:---------------|:----------|:----------------------------|:----------------------------| -| f32 | f32 | f32 | f32 | -| f16 | f16 | f16, u8, s8 | f16, f32 | -| bf16 | bf16 | f32, bf16 | bf16, f32 | -| f32, bf16, f16 | u8, s8 | f32, bf16, f16 | f32, bf16, f16 | -| u8, s8 | s8 | u8, s8, s32, f32, f16, bf16 | u8, s8, s32, f32, f16, bf16 | -| f8_e5m2 | f8_e5m2 | f32, f16, bf16, f8_e5m2 | f32, bf16, f16 | +| Source | Weights | Destination | Bias | +|:-----------------|:---------------------|:---------------------------------|:----------------------------| +| f64 | f64 | f64 | f64, f32, f16, bf16, s8, u8 | +| f32 | f32 | f32 | f32, bf16, f16, u8, s8 | +| f16 | f16, u8, s8, u4, s4 | f16, u8, s8 | f32 | +| f16 | f16, u8, s8 | f32 | f32, f16 | +| bf16 | bf16, u8, s8, u4, s4 | f32, bf16 | f32, bf16 | +| f32, bf16, f16 | u8, s8 | f32, bf16, f16 | f32, bf16, f16 | +| f8_e5m2, f8_e4m3 | f8_e5m2, f8_e4m3 | f32, f16, bf16, f8_e5m2, f8_e4m3 | f32, bf16, f16 | +| u8, s8 | s8 | u8, s8, s32, f32, f16, bf16 | u8, s8, s32, f32, f16, bf16 | + ### Data Representation @@ -178,8 +181,8 @@ memory buffer that shares its shape with the destination buffer). 
   - Sum post-op doesn't support data types other than the destination data type.
   - Bias of bf16 data type is supported for configuration with bf16 source data
     type and weights bf16 data type, and up to three dimensional matrices.
-   - Only reference support is available for f8_e4m3. Optimized implementation
-     for f8_e5m2 is available only on Intel(R) Data Center GPU Max Series.
+   - Optimized implementations for fp8 data types are available only on Intel(R)
+     Data Center GPU Max Series and Intel(R) Xe2 Graphics.
   - Configuration with int8 source data type, s8 weight data type and bf16
     destination data type doesn't support:
     * Destination zero point.
     * Runtime dimensions.
     * Three and higher dimensional matrices.
   - The layout of dropout mask has to be exactly the same as that of dst.
+
3. **CPU**
   - Configuration with int8 source data type, s8 weight data type and f16
     destination data type isn't supported.
   - Configuration with floating point source data type, integer weights data
     type and floating point destination data type is not optimized.
-   - Only reference support for fp8 data types (f8_e5m2, f8_e4m3) is
-     is available on CPU.
   - The layout of dropout mask has to be exactly the same as that of dst.

## Performance Tips
diff --git a/doc/primitives/prelu.md b/doc/primitives/prelu.md
index 52c9669b097..3418cc7c3b9 100644
--- a/doc/primitives/prelu.md
+++ b/doc/primitives/prelu.md
@@ -62,7 +62,7 @@ For no broadcast case, results are calculated using formula:
         \diffdst(n, c, h, w) \cdot \weights(n, c, h, w) & \mbox{if } \src(n, c, h, w) \leq 0
     \end{cases}\\\\
-    \diff_weights(n, c, h, w) &=
+    \diffweights(n, c, h, w) &=
         \min(\src(n, c, h, w), 0) \cdot \diffdst(n, c, h, w)
 \f]
diff --git a/doc/primitives/reorder.md b/doc/primitives/reorder.md
index 76a50405afa..16a8310ec15 100644
--- a/doc/primitives/reorder.md
+++ b/doc/primitives/reorder.md
@@ -115,15 +115,16 @@ would lead to the following operation:

 \f[
     \dst(\overline{x}) =
-        scale_{src} \cdot \src(\overline{x} - shift_{src}) +
+        scale_{src} \cdot (\src(\overline{x}) - shift_{src}) +
         \beta \cdot \dst(\overline{x}) + shift_{dst}
 \f]

 @note
  * The intermediate operations are being done using single precision floating
    point data type.
- * \f$scale_{src}\f$ and \f$scale_{dst}\f$ must be passed during execution runtime
-   as a separate memory argument. Using \f$scale_{src}\f$ argument will lead to
+ * \f$scale_{src}\f$, \f$shift_{src}\f$, \f$scale_{dst}\f$, and
+   \f$shift_{dst}\f$ must be passed at execution time as separate
+   memory arguments. Using \f$scale_{src}\f$ argument will lead to
    multiplication of tensor values by a scale value. Using \f$scale_{dst}\f$
    argument will lead to division of tensor values by a scale value.
diff --git a/doc/primitives/softmax.md b/doc/primitives/softmax.md
index 1ec1a4961e1..cbe15452e6d 100644
--- a/doc/primitives/softmax.md
+++ b/doc/primitives/softmax.md
@@ -100,12 +100,28 @@ argument index as specified by the following table.

 Attributes enable you to modify the behavior of the softmax primitive.
The following attributes are supported by the softmax primitive:

-| Propagation | Type | Operation | Description | Restrictions |
-|:------------|:----------|:-----------------------------------------------------|:--------------------------------------------------------------|:-----------------------------------------------------------------------|
-| forward | attribute | [Scales](@ref dnnl::primitive_attr::set_scales_mask) | Scales the corresponding tensor by the given scale factor(s). | Supported only for int8 softmax and one scale per tensor is supported. |
-| forward | post-op | [Binary](@ref dnnl::post_ops::append_binary) | Applies a @ref dnnl_api_binary operation to the result | General binary post-op restrictions |
-| forward | Post-op | [Eltwise](@ref dnnl::post_ops::append_eltwise) | Applies an @ref dnnl_api_eltwise operation to the result. | |
-
+| Propagation | Type | Operation | Description | Restrictions |
+|:------------|:----------|:----------------------------------------------------------------------|:--------------------------------------------------------------|:-----------------------------------------------------------------------|
+| forward | attribute | [Scales](@ref dnnl::primitive_attr::set_scales_mask) | Scales the corresponding tensor by the given scale factor(s). | Supported only for int8 softmax and one scale per tensor is supported. |
+| forward | post-op | [Binary](@ref dnnl::post_ops::append_binary) | Applies a @ref dnnl_api_binary operation to the result | General binary post-op restrictions |
+| forward | Post-op | [Eltwise](@ref dnnl::post_ops::append_eltwise) | Applies an @ref dnnl_api_eltwise operation to the result. | |
+| forward | attribute | [Accumulation mode](@ref dnnl::primitive_attr::set_accumulation_mode) | Defines the implementation's accumulation arithmetic. | Only the values `strict`, `relaxed`, and `any` are supported. |
+
+#### Accumulation Mode
+
+You can optimize performance of the forward operation when the source and
+destination floating-point data types of the operation are equal and different
+from `f32`. When the destination data type is different from `f32`, additional
+memory will be used to accumulate data and store it in the destination memory
+buffer for a requested data type. You can opt out of the additional memory by
+setting the accumulation mode to
+[relaxed](@ref dnnl::accumulation_mode::relaxed) or
+[any](@ref dnnl::accumulation_mode::any), which will use the precision of the
+destination data type to accumulate intermediate results directly into the
+destination memory buffer. This performance optimization, however, results
+in a minor decrease in accuracy. Depending on the actual data, the difference
+between `strict` and `relaxed` accumulation can reach several units in the
+last place (ulps).

 ### Data Type Support
diff --git a/doc/programming_model/data_types.md b/doc/programming_model/data_types.md
index 9166a2ff2d8..eaee2f2dfb1 100644
--- a/doc/programming_model/data_types.md
+++ b/doc/programming_model/data_types.md
@@ -7,8 +7,7 @@ to be the golden standard in deep learning applications and is supported in
 all the library functions. The purpose of low precision data types support
 is to improve performance of compute intensive operations, such as
 convolutions, inner product, and recurrent neural network cells
-in comparison to fp32. Boolean data type is used for Graph Compiler to optimize
-operations which take bool as inputs and/or outputs data type.
+in comparison to fp32.
| Data type | Description | |:----------|:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| @@ -21,32 +20,27 @@ operations which take bool as inputs and/or outputs data type. | boolean | bool (size is C++ implementation defined) | | f8\_e5m2 | [OFP8 standard 8-bit floating-point](https://www.opencompute.org/documents/ocp-8-bit-floating-point-specification-ofp8-revision-1-0-2023-06-20-pdf) with 5 exponent and 2 mantissa bits | | f8\_e4m3 | [OFP8 standard 8-bit floating-point](https://www.opencompute.org/documents/ocp-8-bit-floating-point-specification-ofp8-revision-1-0-2023-06-20-pdf) with 4 exponent and 3 mantissa bits | +| e8m0 | [MX standard 8-bit scaling type](https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf) | +| f4\_e2m1 | [MX standard 4-bit floating-point](https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf) with 2 exponent and 1 mantissa bits | +| f4\_e3m0 | 4-bit floating-point with 3 exponent bits and no mantissa bit | -@note - Boolean is only supported in the Graph Compiler in CPU engines. No - primitives support boolean during primitive computation. - ## Inference and Training oneDNN supports training and inference with the following data types: -| Usage mode | CPU | GPU | -|:-----------|:---------------------------------------------------------|:----------------------------------------------| -| Inference | f32, bf16, f16, f8\_e5m2/f8\_e4m3, s8/u8, s4/u4, boolean | f32, bf16, f16, f8\_e5m2/f8\_e4m3, s8/u8, f64 | -| Training | f32, bf16, f16 | f32, bf16, f16, f64 | +| Usage mode | CPU | GPU | +|:-----------|:-----------------------------------------------------------------------------|:----------------------------------------------| +| Inference | f32, bf16, f16, f8\_e5m2/f8\_e4m3, f4\_e2m1, f4\_e3m0, s8/u8, s4/u4, boolean | f32, bf16, f16, f8\_e5m2/f8\_e4m3, s8/u8, f64 | +| Training | f32, bf16, f16, f8\_e5m2/f8\_e4m3 | f32, bf16, f16, f8\_e5m2/f8\_e4m3, f64 | @note Using lower precision arithmetic may require changes in the deep learning model implementation. @note - f64 is only supported for convolution, reorder, layer normalization and - pooling primitives, on the GPU engine. - -@note - Boolean is only supported by the oneDNN graph API when the graph compiler - backend is enabled. + f64 is supported only for matmul, convolution, reorder, layer normalization, and + pooling primitives on the GPU engine. @note s4/u4 data types are only supported as a storage data type for weights argument @@ -75,12 +69,12 @@ post-ops). The following formula governs the datatypes dynamic during a primitive computation: \f[ -\operatorname{convert_{dst\_dt}} ( \operatorname{dst\_zero\_point_{f32}} + \operatorname{postops_{f32}} (\operatorname{oscale_{f32}} * \operatorname{convert_{f32}} (\operatorname{Op}(\operatorname{src_{src\_dt}}, \operatorname{weights_{wei\_dt}}, ...)))) +\operatorname{convert_{dst\_dt}} ( \operatorname{zp_{dst}} + 1/\operatorname{scale_{dst}} * \operatorname{postops_{f32}} (\operatorname{convert_{f32}} (\operatorname{Op}(\operatorname{src_{src\_dt}}, \operatorname{weights_{wei\_dt}}, ...)))) \f] The `Op` output datatype depends on the datatype of its inputs: - if `src`, `weights`, ... are floating-point datatype (f32, f16, - bf16, f8\_e5m2, f8\_e4m3), then the `Op` outputs f32 elements. + bf16, f8\_e5m2, f8\_e4m3, f4\_e2m1, f4\_e3m0), then the `Op` outputs f32 elements. 
- if `src`, `weights`, ... are integral datatypes (s8, u8, s32), then the
  `Op` outputs s32 elements.
- if the primitive allows mixing input datatypes, the `Op` outputs
@@ -96,7 +90,15 @@
No downconversions are allowed by default, but can be enabled using
the floating-point math controls described in
@ref dev_guide_attributes_fpmath_mode (a hedged usage sketch is provided at
the end of this section).
-
+The \f$convert_{dst\_dt}\f$ conversion is guaranteed to be faithfully
+rounded but not guaranteed to be correctly rounded (the returned value
+is not always the closest one but one of the two closest representable
+values). In particular, some hardware platforms have no direct
+conversion instructions from f32 data type to low-precision data types
+such as fp8 or fp4, and will perform conversion through an
+intermediate data type (for example f16 or bf16), which may result in
+[double
+rounding](https://en.wikipedia.org/wiki/Rounding#Double_rounding).

### Rounding mode and denormal handling

@@ -111,8 +113,11 @@ the floating-point environment can control:

 @note
   For CPU devices, the default floating-point environment is defined by
-  the C and C++ standards in the fenv.h header. Rounding mode can be
-  changed globally using the fesetround() C function.
+  the C and C++ standards in the following header:
+~~~cpp
+#include <fenv.h>
+~~~
+  Rounding mode can be changed globally using the `fesetround()` C function.

 @note
   Most DNN applications do not require precise computations with denormal
@@ -164,7 +169,8 @@ types that oneDNN recognizes.
 | bf16 | Intel DL Boost with bfloat16 support |
 | f16 | Intel AVX512-FP16 |
 | boolean | Intel AVX2 |
-| f8\_e5m2, f8\_e4m3 | TBA. |
+| f8\_e5m2, f8\_e4m3 | Intel AVX512-FP16 |
+| f4\_e2m1, f4\_e3m0 | TBA |

 @note
   See @ref dev_guide_int8_computations in the Developer Guide for additional
@@ -205,30 +211,33 @@ library:
   * Intel(R) Data Center GPU Flex Series (formerly Arctic Sound)
 * Xe-HPC (accelerated f16, bf16, u8, and s8 support via DPAS and f64 support via MAD)
   * Intel(R) Data Center GPU Max Series (formerly Ponte Vecchio)
+* Xe2-LPG
+  * Intel(R) Graphics for Intel(R) Core(TM) Ultra processors (Series 2) (formerly Lunar Lake)
+* Xe2-HPG
+  * Intel(R) Arc(TM) B-Series Graphics (formerly Battlemage)

 The following table indicates the data types with performant compute primitives
 for each uArch supported by oneDNN. Unless otherwise noted, all data types have
 reference support on all architectures.

-| uArch | Supported Data types |
-|:-------|:-------------------------------------------------|
-| Xe-LP | f32, f16, s8, u8 |
-| Xe-HPG | f32, f16, bf16, s8, u8 |
-| Xe-HPC | f64, f32, bf16, f16, s8, u8 |
-| TBA | f64, f32, bf16, f16, s8, u8, f8\_e5m2, f8\_e4m3 |
+| uArch | Supported Data types |
+|:--------|:--------------------------------------------------------------------|
+| Xe-LP | f32, f16, s8, u8 |
+| Xe-HPG | f32, f16, bf16, s8, u8 |
+| Xe-HPC | f64, f32, bf16, f16, s8, u8 |
+| Xe2-LPG | f64, f32, bf16, f16, s8, u8 |
+| Xe2-HPG | f64, f32, bf16, f16, s8, u8 |
+| TBA | f64, f32, bf16, f16, s8, u8, f8\_e5m2, f8\_e4m3, f4\_e2m1, f4\_e3m0 |
+

 @note
   f64 configurations are only supported on GPU engines with HW capability for
   double-precision floating-point.

 @note
-  f8\_e5m2 compute operations have limited performance through upconversion on
-  Xe-HPC.
+  f8\_e5m2 and f8\_e4m3 compute operations have limited performance through upconversion on
+  Xe-HPC and Xe2 GPUs.

 @note
   f16 operations may be faster with f16 accumulation on GPU architectures
   older than Xe-HPC. Newer architectures accumulate to f32.
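As promised above, a short sketch of the floating-point math controls. It
requests implicit f32-to-bf16 downconversion inside a matmul via
`dnnl::primitive_attr::set_fpmath_mode()`; the shapes and names are
illustrative only.

~~~cpp
// Hedged sketch: allow implicit f32 -> bf16 downconversion inside a matmul.
#include "dnnl.hpp"
using namespace dnnl;

matmul make_bf16_relaxed_matmul(const engine &eng) {
    memory::desc src_md({128, 256}, memory::data_type::f32, memory::format_tag::ab);
    memory::desc wei_md({256, 512}, memory::data_type::f32, memory::format_tag::ab);
    memory::desc dst_md({128, 512}, memory::data_type::f32, memory::format_tag::ab);

    primitive_attr attr;
    // Permit the implementation to downconvert f32 inputs to bf16 internally;
    // without this attribute no downconversions are allowed.
    attr.set_fpmath_mode(fpmath_mode::bf16);

    auto pd = matmul::primitive_desc(eng, src_md, wei_md, dst_md, attr);
    return matmul(pd);
}
~~~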
- -@note - Boolean is only supported by the oneDNN graph API when the graph compiler - backend is enabled. The graph compiler backend only supports the CPU engine. diff --git a/doc/rst/graph_extension.rst b/doc/rst/graph_extension.rst index 5d835236442..cd681e1d9c2 100644 --- a/doc/rst/graph_extension.rst +++ b/doc/rst/graph_extension.rst @@ -6,7 +6,6 @@ Graph Extension graph_programming_model graph_supported_operations - dev_guide_graph_fusion_patterns + graph_fusion_patterns dev_guide_graph_dump dev_guide_constant_tensor_cache - dev_guide_graph_compiler diff --git a/doc/rst/index.rst b/doc/rst/index.rst index 8cdad3d559d..9a0e3d232c1 100644 --- a/doc/rst/index.rst +++ b/doc/rst/index.rst @@ -1,5 +1,5 @@ -oneAPI Deep Neural Network Library Developer Guide and Reference -======================================================================= +oneAPI Deep Neural Network Library (oneDNN) Developer Guide and Reference +========================================================================= .. toctree:: :maxdepth: 1 diff --git a/doc/rst/orphans.rst b/doc/rst/orphans.rst index db05dbc3653..636e6a6d210 100644 --- a/doc/rst/orphans.rst +++ b/doc/rst/orphans.rst @@ -37,6 +37,7 @@ Orphans example_convolution.cpp.rst example_cpu_cnn_training_f32.c.rst example_cpu_matmul_csr.cpp.rst + example_cpu_matmul_coo.cpp.rst example_cpu_matmul_quantization.cpp.rst example_cpu_matmul_weights_compression.cpp.rst example_cpu_rnn_inference_f32.cpp.rst @@ -79,6 +80,7 @@ Orphans page_convolution_example_cpp.rst page_convolution_example_cpp_short.rst page_cpu_matmul_csr_cpp + page_cpu_matmul_coo_cpp page_cpu_matmul_weights_compression_cpp page_cpu_matmul_quantization_cpp.rst page_cpu_matmul_quantization_cpp_short.rst diff --git a/doc/sphinx/_static/favicons.png b/doc/sphinx/_static/favicons.png new file mode 100644 index 00000000000..f450376b19e Binary files /dev/null and b/doc/sphinx/_static/favicons.png differ diff --git a/doc/sphinx/_static/oneAPI-rgb-rev-100.png b/doc/sphinx/_static/oneAPI-rgb-rev-100.png new file mode 100644 index 00000000000..58d2d5c54e5 Binary files /dev/null and b/doc/sphinx/_static/oneAPI-rgb-rev-100.png differ diff --git a/doc/sphinx/conf.py b/doc/sphinx/conf.py index 976056daff7..a512c72b941 100644 --- a/doc/sphinx/conf.py +++ b/doc/sphinx/conf.py @@ -51,7 +51,7 @@ def whereis(binary): # -- Project information ----------------------------------------------------- project = 'oneDNN' -copyright = '2016-2024 Intel Corporation' +copyright = '2016-2025 Intel Corporation' author = '' # -- General configuration --------------------------------------------------- @@ -116,6 +116,8 @@ def whereis(binary): # Add any paths that contain templates here, relative to this directory. templates_path = ['_templates'] +source_suffix = '.rst' + # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This pattern also affects html_static_path and html_extra_path. 
@@ -135,11 +137,16 @@ def whereis(binary):
 html_static_path = ['_static']
 #html_js_files = [('dnnl.js', {'defer': 'defer'})]

+html_logo = '_static/oneAPI-rgb-rev-100.png'
+html_favicon = '_static/favicons.png'
+
 html_theme_options = {
-    "repository_url": "https://github.com/oneapi-src/oneDNN",
-    "repository_branch": "master",
+    "repository_url": "https://github.com/uxlfoundation/oneDNN",
+    "repository_branch": "main",
     "use_repository_button": True,
-    "use_download_button": False
+    "use_download_button": True,
+    "path_to_docs": "doc",
+    "use_issues_button": True
 }

 mathjax3_config = {
diff --git a/doc/ukernel/operations/brgemm.md b/doc/ukernel/operations/brgemm.md
index acefec29b2f..7e4520fb032 100644
--- a/doc/ukernel/operations/brgemm.md
+++ b/doc/ukernel/operations/brgemm.md
@@ -44,14 +44,14 @@ The BRGeMM ukernel supports the following combinations of data types.

 Because of hardware restrictions, the BRGeMM ukernel requires a specific data
 layout. For x86-64 architecture this layout applies to a B matrix. It is
-expressed through @ref dnnl::ukernel::pack_type which can be queried by
-@ref dnnl::ukernel::brgemm::get_B_pack_type call. If the query returns
-@ref dnnl::ukernel::brgemm::pack_type::no_trans, then packing is not required.
+expressed through #dnnl::ukernel::pack_type which can be queried by
+#dnnl::ukernel::brgemm::get_B_pack_type call. If the query returns
+#dnnl::ukernel::pack_type::no_trans, then packing is not required.
 Otherwise, the user is responsible for packing the data appropriately before
-calling @ref dnnl::ukernel::brgemm::execute, either with custom code, or by
-using a dedicated set of APIs: @ref dnnl::ukernel::transform::generate for
+calling #dnnl::ukernel::brgemm::execute, either with custom code, or by
+using a dedicated set of APIs: #dnnl::ukernel::transform::generate for
 generating a kernel of a transform routine and
-@ref dnnl::ukernel::transform::execute to run the generated kernel.
+#dnnl::ukernel::transform::execute to run the generated kernel.

 ## Attributes
diff --git a/doc/ukernel/operations/transform.md b/doc/ukernel/operations/transform.md
index 1e0ecb742ff..be3f24d8455 100644
--- a/doc/ukernel/operations/transform.md
+++ b/doc/ukernel/operations/transform.md
@@ -2,16 +2,26 @@ Data transformation {#dev_guide_ukernel_transform}
 =======================================

 >
-> [API Reference](@ref dnnl_api_ukernel_brgemm)
+> [API Reference](@ref dnnl::ukernel::transform)
 >

 ## General

-The transform ukernel allows users to convert data from one format to the other,
-similar to what reorder primitive provides functionally.
+The [BRGeMM ukernel](@ref dev_guide_ukernel_brgemm) might require the B tensor
+in a specific memory layout depending on target data types and the machine
+architecture. Check the requirement by calling the
+[get_B_pack_type()](@ref dnnl::ukernel::brgemm::get_B_pack_type) function. If it
+returns the [pack32](@ref dnnl::ukernel::pack_type::pack32) type, packing is
+required; otherwise, it is not.
+
+The transform ukernel allows the conversion of data from the original layout,
+which is described as either
+[non-transposed](@ref dnnl::ukernel::pack_type::no_trans) or
+[transposed](@ref dnnl::ukernel::pack_type::trans), to the layout requested by
+the BRGeMM ukernel.
+
+The only supported output packing type is `pack32`.

-The only output data format supported by this routine is packed format, which is
-required by B matrices in [BRGeMM ukernel](@ref dev_guide_ukernel_brgemm).
This is an out-of-place operation.
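A hedged usage sketch of the query-then-pack flow described above follows; it
assumes the experimental ukernel API in `dnnl_ukernel.hpp`, whose exact
signatures may vary between releases.

~~~cpp
// Hedged sketch: pack B only when the BRGeMM ukernel reports pack32.
#include <cstdint>
#include <vector>
#include "oneapi/dnnl/dnnl_ukernel.hpp"
using namespace dnnl;
using namespace dnnl::ukernel;

void pack_B_if_needed() {
    const memory::dim K = 64, N = 48; // out_ld of 48 is a supported value
    // Assumption: get_B_pack_type() reports the packing required for the
    // given A/B data types (bf16 here), as described above.
    pack_type pt = brgemm::get_B_pack_type(
            memory::data_type::bf16, memory::data_type::bf16);
    if (pt != pack_type::pack32) return; // plain layout is already usable

    std::vector<uint16_t> B(K * N), B_packed(K * N);
    // Source is non-transposed with leading dimension N; the destination
    // leading dimension must be 16, 32, 48, or 64 (see the limitations below).
    transform pack_B(K, N, pack_type::no_trans, /*in_ld=*/N, /*out_ld=*/N,
            memory::data_type::bf16, memory::data_type::bf16);
    pack_B.generate();
    pack_B.execute(B.data(), B_packed.data());
}
~~~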
## Data Types

@@ -34,7 +44,9 @@ No attribute is supported for transform ukernel.

 ## Implementation limitations

-- Destination leading dimension only supported values are: 16, 32, 48, or 64.
+- Destination leading dimension, or `out_ld`, must be one of the following
+  values: `16`, `32`, `48`, or `64`. This is an implementation limitation;
+  there are no efficient kernels for other leading dimension values.

 ## Examples
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 2d848af454a..56d6a48787b 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -1,5 +1,5 @@
 #===============================================================================
-# Copyright 2016-2024 Intel Corporation
+# Copyright 2016-2025 Intel Corporation
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -57,6 +57,7 @@ file(GLOB_RECURSE headers *.hpp *.h)

 if(NOT DNNL_EXPERIMENTAL_SPARSE)
     list(REMOVE_ITEM sources ${CMAKE_CURRENT_SOURCE_DIR}/cpu_matmul_csr.cpp)
+    list(REMOVE_ITEM sources ${CMAKE_CURRENT_SOURCE_DIR}/cpu_matmul_coo.cpp)
     list(REMOVE_ITEM sources
         ${CMAKE_CURRENT_SOURCE_DIR}/cpu_matmul_weights_compression.cpp)
 endif()
@@ -74,7 +75,10 @@ if(DNNL_SYCL_CUDA)
         ${CMAKE_CURRENT_SOURCE_DIR}/primitives/lstm.cpp
         ${CMAKE_CURRENT_SOURCE_DIR}/primitives/layer_normalization.cpp
         ${CMAKE_CURRENT_SOURCE_DIR}/primitives/reorder.cpp
-        ${CMAKE_CURRENT_SOURCE_DIR}/primitives/shuffle.cpp)
+        ${CMAKE_CURRENT_SOURCE_DIR}/primitives/shuffle.cpp
+        ${CMAKE_CURRENT_SOURCE_DIR}/primitives/group_normalization.cpp
+        ${CMAKE_CURRENT_SOURCE_DIR}/primitives/vanilla_rnn.cpp
+        ${CMAKE_CURRENT_SOURCE_DIR}/primitives/lbr_gru.cpp)
 endif()

 # Remove examples for Graph API if graph component is not enabled
@@ -90,9 +94,26 @@ if(NOT ONEDNN_BUILD_GRAPH)
         ${CMAKE_CURRENT_SOURCE_DIR}/graph/mqa.cpp
         ${CMAKE_CURRENT_SOURCE_DIR}/graph/sdpa_stacked_qkv.cpp
         ${CMAKE_CURRENT_SOURCE_DIR}/graph/gqa.cpp
+        ${CMAKE_CURRENT_SOURCE_DIR}/graph/gated_mlp.cpp
+        ${CMAKE_CURRENT_SOURCE_DIR}/graph/gated_mlp_wei_combined.cpp
+        ${CMAKE_CURRENT_SOURCE_DIR}/graph/gated_mlp_int4.cpp
         )
 endif()

+if(DNNL_SYCL_GENERIC)
+    list(REMOVE_ITEM sources
+        # XXX: Enable when InnerProduct is implemented
+        ${CMAKE_CURRENT_SOURCE_DIR}/cnn_inference_f32.cpp
+        ${CMAKE_CURRENT_SOURCE_DIR}/primitives/inner_product.cpp
+        ${CMAKE_CURRENT_SOURCE_DIR}/rnn_training_f32.cpp
+        # XXX: Enable when Reduction is implemented
+        ${CMAKE_CURRENT_SOURCE_DIR}/primitives/reduction.cpp
+        ${CMAKE_CURRENT_SOURCE_DIR}/primitives/group_normalization.cpp
+        ${CMAKE_CURRENT_SOURCE_DIR}/primitives/lbr_gru.cpp
+        ${CMAKE_CURRENT_SOURCE_DIR}/primitives/lstm.cpp
+        ${CMAKE_CURRENT_SOURCE_DIR}/primitives/vanilla_rnn.cpp)
+endif()
+
 if(DNNL_SYCL_HIP)
     # Build examples for supported primitives that support required features.
set(sources) @@ -183,12 +204,8 @@ foreach(src ${sources}) endif() endforeach() -if (DNNL_INSTALL_MODE STREQUAL "BUNDLE" OR DNNL_INSTALL_MODE STREQUAL "BUNDLE_V2") - if(DNNL_INSTALL_MODE STREQUAL "BUNDLE") - set(BUNDLE_EXAMPLES_DIR "examples") - else() - set(BUNDLE_EXAMPLES_DIR "${CMAKE_INSTALL_DATAROOTDIR}/doc/${LIB_PACKAGE_NAME}/examples") - endif() +if (DNNL_INSTALL_MODE STREQUAL "BUNDLE") + set(BUNDLE_EXAMPLES_DIR "${CMAKE_INSTALL_DATAROOTDIR}/doc/${LIB_PACKAGE_NAME}/examples") configure_file(CMakeLists.txt.in CMakeLists.txt @ONLY) install(FILES diff --git a/examples/CMakeLists.txt.in b/examples/CMakeLists.txt.in index ed408567f85..4ac638719fc 100644 --- a/examples/CMakeLists.txt.in +++ b/examples/CMakeLists.txt.in @@ -1,5 +1,5 @@ #=============================================================================== -# Copyright 2019-2024 Intel Corporation +# Copyright 2019-2025 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,7 +14,7 @@ # limitations under the License. #=============================================================================== -cmake_minimum_required(VERSION 2.8.12) +cmake_minimum_required(VERSION 3.13) if("${CMAKE_BUILD_TYPE}" STREQUAL "") message(STATUS "CMAKE_BUILD_TYPE is unset, defaulting to Release") @@ -28,18 +28,9 @@ project (DNNL_EXAMPLES) set(DNNL_CPU_RUNTIME "@DNNL_CPU_RUNTIME@") set(DNNL_GPU_RUNTIME "@DNNL_GPU_RUNTIME@") -if(POLICY CMP0015) - cmake_policy(SET CMP0015 NEW) -endif() - -# Use _ROOT env. variable as a prefix -if(POLICY CMP0074) - cmake_policy(SET CMP0074 NEW) -endif() - set(DNNL_INSTALL_MODE "@DNNL_INSTALL_MODE@") set(IS_NEW_DIR_LAYOUT FALSE) -if(DNNL_INSTALL_MODE STREQUAL "BUNDLE_V2") +if(DNNL_INSTALL_MODE STREQUAL "BUNDLE") set(IS_NEW_DIR_LAYOUT TRUE) endif() @@ -84,25 +75,8 @@ if(CMAKE_BASE_NAME MATCHES "^(icx|icpx)$") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-recommended-option -Wno-unknown-warning-option") endif() -function(find_libm var) - # This is to account for the linker cache in OSX11. might work - # with lower than 3.9.4, but was not able to test with anything - # between 2.8 and 3.9. See here for more details: - # https://gitlab.kitware.com/cmake/cmake/-/issues/20863 - if (APPLE AND (${CMAKE_HOST_SYSTEM_VERSION} VERSION_GREATER "20.0.0") - AND (${CMAKE_VERSION} VERSION_LESS "3.9.4")) - message(INFO "Using OSX11 and above with CMAKE older than 3.18 can cause linking issues.") - set(OSX11_AND_OLDER_CMAKE TRUE) - endif() - - if(UNIX AND (NOT (APPLE AND OSX11_AND_OLDER_CMAKE))) - find_library(${var} m REQUIRED) - endif() -endfunction() - - if(UNIX OR MINGW) - find_libm(LIBM) + find_library(LIBM m REQUIRED) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -std=c99") if(NOT DNNL_WITH_SYCL) @@ -116,22 +90,11 @@ if(UNIX OR MINGW) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -w") endif() -if(WIN32 AND ${CMAKE_CXX_COMPILER_ID} STREQUAL MSVC) +if(${CMAKE_CXX_COMPILER_ID} STREQUAL MSVC) add_definitions(/Qpar) add_definitions(/openmp) else() find_package(OpenMP) - #newer version for findOpenMP (>= v. 
3.9) - if(CMAKE_VERSION VERSION_LESS "3.9" AND OPENMP_FOUND) - if(${CMAKE_MAJOR_VERSION} VERSION_LESS "3" AND - ${CMAKE_CXX_COMPILER_ID} STREQUAL "Intel") - # Override FindOpenMP flags for Intel Compiler (otherwise deprecated) - set(OpenMP_CXX_FLAGS "-fopenmp") - set(OpenMP_C_FLAGS "-fopenmp") - endif() - set(OpenMP_C_FOUND true) - set(OpenMP_CXX_FOUND true) - endif() if(OpenMP_C_FOUND) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") endif() @@ -190,7 +153,7 @@ elseif(APPLE) set(CTESTCONFIG_PATH "${DNNLROOT}/lib") endif() -# Common configuration for tests / test cases on Windows and Apple +# Configuration for tests / test cases on Windows function(maybe_configure_test name kind) if(WIN32) string(REPLACE ";" "\;" PATH "${CTESTCONFIG_PATH};$ENV{PATH}") @@ -198,14 +161,6 @@ function(maybe_configure_test name kind) if(CMAKE_GENERATOR MATCHES "Visual Studio") configure_file(template.vcxproj.user ${name}.vcxproj.user @ONLY) endif() - elseif(APPLE) - # When LIBRARY_PATH is set (e.g. when using compiler env. scripts) - # cmake may stop passing `rpath` linker option. The hack below adds the - # LIBRARY_PATH to DYLD_LIBRARY_PATH to make the executable find its - # dependencies. - # TODO: the problem may be in older version of cmake (2.8.11), revisit. - set_property(${kind} ${name} PROPERTY ENVIRONMENT - "DYLD_LIBRARY_PATH=${CTESTCONFIG_PATH}:$ENV{LIBRARY_PATH}:$ENV{DYLD_LIBRARY_PATH}") endif() endfunction() diff --git a/examples/bnorm_u8_via_binary_postops.cpp b/examples/bnorm_u8_via_binary_postops.cpp index e72c852239a..eab8bf18635 100644 --- a/examples/bnorm_u8_via_binary_postops.cpp +++ b/examples/bnorm_u8_via_binary_postops.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2020-2022 Intel Corporation +* Copyright 2020-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -46,9 +46,6 @@ using namespace dnnl; -using tag = memory::format_tag; -using dt = memory::data_type; - void bnorm_u8_via_binary_postops(dnnl::engine::kind engine_kind) { // Create execution dnnl::engine. @@ -102,12 +99,18 @@ void bnorm_u8_via_binary_postops(dnnl::engine::kind engine_kind) { oscale_data.begin(), oscale_data.end(), []() { return 0.5f; }); // Create descriptors. - auto src_md = memory::desc(src_dims, dt::u8, tag::nhwc); - auto mean_md = memory::desc(params_dims, dt::f32, tag::nhwc); - auto variance_md = memory::desc(params_dims, dt::f32, tag::nhwc); - auto scale_md = memory::desc(params_dims, dt::f32, tag::nhwc); - auto shift_md = memory::desc(params_dims, dt::f32, tag::nhwc); - auto oscale_md = memory::desc(params_dims, dt::f32, tag::nhwc); + auto src_md = memory::desc( + src_dims, memory::data_type::u8, memory::format_tag::nhwc); + auto mean_md = memory::desc( + params_dims, memory::data_type::f32, memory::format_tag::nhwc); + auto variance_md = memory::desc( + params_dims, memory::data_type::f32, memory::format_tag::nhwc); + auto scale_md = memory::desc( + params_dims, memory::data_type::f32, memory::format_tag::nhwc); + auto shift_md = memory::desc( + params_dims, memory::data_type::f32, memory::format_tag::nhwc); + auto oscale_md = memory::desc( + params_dims, memory::data_type::f32, memory::format_tag::nhwc); // Create src memory objects. 
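The hunks in this file, and in the example files below, all apply the same mechanical change: the file-local `dt`/`tag` aliases are dropped in favor of fully qualified enum names. A minimal before/after sketch of the pattern (shapes illustrative):

    // Before: file-local aliases.
    using tag = dnnl::memory::format_tag;
    using dt = dnnl::memory::data_type;
    auto md_old = dnnl::memory::desc({1, 8, 14, 14}, dt::f32, tag::nhwc);

    // After: fully qualified names, as in the hunks above and below.
    auto md_new = dnnl::memory::desc({1, 8, 14, 14},
            dnnl::memory::data_type::f32, dnnl::memory::format_tag::nhwc);
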
auto src_mem = memory(src_md, engine); diff --git a/examples/cnn_inference_f32.cpp b/examples/cnn_inference_f32.cpp index 355f0c11561..24c39eca166 100644 --- a/examples/cnn_inference_f32.cpp +++ b/examples/cnn_inference_f32.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2016-2022 Intel Corporation +* Copyright 2016-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -51,9 +51,6 @@ using namespace dnnl; void simple_net(engine::kind engine_kind, int times = 100) { - using tag = memory::format_tag; - using dt = memory::data_type; - /// Initialize an engine and stream. The last parameter in the call represents /// the index of the engine. /// @snippet cnn_inference_f32.cpp Initialize engine and stream @@ -91,33 +88,43 @@ void simple_net(engine::kind engine_kind, int times = 100) { std::vector conv1_bias(product(conv1_bias_tz)); //[Allocate buffers] - /// Create memory that describes data layout in the buffers. This example uses - /// tag::nchw (batch-channels-height-width) for input data and tag::oihw - /// for weights. + /// Create memory that describes data layout in the buffers. This example + /// uses dnnl::memory::format_tag::nchw (batch-channels-height-width) + /// for input data and dnnl::memory::format_tag::oihw for weights. /// @snippet cnn_inference_f32.cpp Create user memory //[Create user memory] - auto user_src_memory = memory({{conv1_src_tz}, dt::f32, tag::nchw}, eng); + auto user_src_memory = memory( + {{conv1_src_tz}, memory::data_type::f32, memory::format_tag::nchw}, + eng); write_to_dnnl_memory(user_src.data(), user_src_memory); auto user_weights_memory - = memory({{conv1_weights_tz}, dt::f32, tag::oihw}, eng); + = memory({{conv1_weights_tz}, memory::data_type::f32, + memory::format_tag::oihw}, + eng); write_to_dnnl_memory(conv1_weights.data(), user_weights_memory); - auto conv1_user_bias_memory - = memory({{conv1_bias_tz}, dt::f32, tag::x}, eng); + auto conv1_user_bias_memory = memory( + {{conv1_bias_tz}, memory::data_type::f32, memory::format_tag::x}, + eng); write_to_dnnl_memory(conv1_bias.data(), conv1_user_bias_memory); //[Create user memory] - /// Create memory descriptors with layout tag::any. The `any` format enables - /// the convolution primitive to choose the data format that will result in - /// best performance based on its input parameters (convolution kernel - /// sizes, strides, padding, and so on). If the resulting format is different - /// from `nchw`, the user data must be transformed to the format required for - /// the convolution (as explained below). + /// Create memory descriptors with layout dnnl::memory::format_tag::any. + /// The `any` format enables the convolution primitive to choose the data + /// format that will result in best performance based on its input + /// parameters (convolution kernel sizes, strides, padding, and so on). + /// If the resulting format is different from `nchw`, the user data must be + /// transformed to the format required for the convolution (as explained + /// below). 
/// @snippet cnn_inference_f32.cpp Create convolution memory descriptors //[Create convolution memory descriptors] - auto conv1_src_md = memory::desc({conv1_src_tz}, dt::f32, tag::any); - auto conv1_bias_md = memory::desc({conv1_bias_tz}, dt::f32, tag::any); - auto conv1_weights_md = memory::desc({conv1_weights_tz}, dt::f32, tag::any); - auto conv1_dst_md = memory::desc({conv1_dst_tz}, dt::f32, tag::any); + auto conv1_src_md = memory::desc( + {conv1_src_tz}, memory::data_type::f32, memory::format_tag::any); + auto conv1_bias_md = memory::desc( + {conv1_bias_tz}, memory::data_type::f32, memory::format_tag::any); + auto conv1_weights_md = memory::desc({conv1_weights_tz}, + memory::data_type::f32, memory::format_tag::any); + auto conv1_dst_md = memory::desc( + {conv1_dst_tz}, memory::data_type::f32, memory::format_tag::any); //[Create convolution memory descriptors] /// Create a convolution primitive descriptor by specifying engine, @@ -136,9 +143,9 @@ void simple_net(engine::kind engine_kind, int times = 100) { conv1_strides, conv1_padding, conv1_padding); //[Create convolution primitive descriptor] - /// Check whether data and weights formats required by convolution is different - /// from the user format. In case it is different change the layout using - /// reorder primitive. + /// Check whether data and weights formats required by convolution is + /// different from the user format. In case it is different change the + /// layout using reorder primitive. /// @snippet cnn_inference_f32.cpp Reorder data and weights //[Reorder data and weights] auto conv1_src_memory = user_src_memory; @@ -180,7 +187,8 @@ void simple_net(engine::kind engine_kind, int times = 100) { /// Create the relu primitive. For better performance, keep the input data /// format for ReLU (as well as for other operation primitives until another /// convolution or inner product is encountered) the same as the one chosen - /// for convolution. Also note that ReLU is done in-place by using conv1 memory. + /// for convolution. Also note that ReLU is done in-place by using conv1 + /// memory. /// @snippet cnn_inference_f32.cpp Create relu primitive //[Create relu primitive] auto relu1_prim_desc @@ -224,11 +232,12 @@ void simple_net(engine::kind engine_kind, int times = 100) { memory::dims pool_dilation = {0, 0}; memory::dims pool_padding = {0, 0}; - auto pool1_dst_md = memory::desc({pool1_dst_tz}, dt::f32, tag::any); + auto pool1_dst_md = memory::desc( + {pool1_dst_tz}, memory::data_type::f32, memory::format_tag::any); /// For training execution, pooling requires a private workspace memory - /// to perform the backward pass. However, pooling should not use 'workspace' - /// for inference, because this is detrimental to performance. + /// to perform the backward pass. However, pooling should not use + /// 'workspace' for inference, because this is detrimental to performance. 
/// @snippet cnn_inference_f32.cpp Create pooling primitive /// /// The example continues to create more layers according @@ -260,17 +269,24 @@ void simple_net(engine::kind engine_kind, int times = 100) { // create memory for user data auto conv2_user_weights_memory - = memory({{conv2_weights_tz}, dt::f32, tag::goihw}, eng); + = memory({{conv2_weights_tz}, memory::data_type::f32, + memory::format_tag::goihw}, + eng); write_to_dnnl_memory(conv2_weights.data(), conv2_user_weights_memory); - auto conv2_user_bias_memory - = memory({{conv2_bias_tz}, dt::f32, tag::x}, eng); + auto conv2_user_bias_memory = memory( + {{conv2_bias_tz}, memory::data_type::f32, memory::format_tag::x}, + eng); write_to_dnnl_memory(conv2_bias.data(), conv2_user_bias_memory); // create memory descriptors for convolution data w/ no specified format - auto conv2_src_md = memory::desc({conv2_src_tz}, dt::f32, tag::any); - auto conv2_bias_md = memory::desc({conv2_bias_tz}, dt::f32, tag::any); - auto conv2_weights_md = memory::desc({conv2_weights_tz}, dt::f32, tag::any); - auto conv2_dst_md = memory::desc({conv2_dst_tz}, dt::f32, tag::any); + auto conv2_src_md = memory::desc( + {conv2_src_tz}, memory::data_type::f32, memory::format_tag::any); + auto conv2_bias_md = memory::desc( + {conv2_bias_tz}, memory::data_type::f32, memory::format_tag::any); + auto conv2_weights_md = memory::desc({conv2_weights_tz}, + memory::data_type::f32, memory::format_tag::any); + auto conv2_dst_md = memory::desc( + {conv2_dst_tz}, memory::data_type::f32, memory::format_tag::any); // create a convolution auto conv2_prim_desc = convolution_forward::primitive_desc(eng, @@ -348,7 +364,8 @@ void simple_net(engine::kind engine_kind, int times = 100) { memory::dims pool2_dilation = {0, 0}; memory::dims pool2_padding = {0, 0}; - auto pool2_dst_md = memory::desc({pool2_dst_tz}, dt::f32, tag::any); + auto pool2_dst_md = memory::desc( + {pool2_dst_tz}, memory::data_type::f32, memory::format_tag::any); // create a pooling auto pool2_pd = pooling_forward::primitive_desc(eng, @@ -377,17 +394,24 @@ void simple_net(engine::kind engine_kind, int times = 100) { // create memory for user data auto conv3_user_weights_memory - = memory({{conv3_weights_tz}, dt::f32, tag::oihw}, eng); + = memory({{conv3_weights_tz}, memory::data_type::f32, + memory::format_tag::oihw}, + eng); write_to_dnnl_memory(conv3_weights.data(), conv3_user_weights_memory); - auto conv3_user_bias_memory - = memory({{conv3_bias_tz}, dt::f32, tag::x}, eng); + auto conv3_user_bias_memory = memory( + {{conv3_bias_tz}, memory::data_type::f32, memory::format_tag::x}, + eng); write_to_dnnl_memory(conv3_bias.data(), conv3_user_bias_memory); // create memory descriptors for convolution data w/ no specified format - auto conv3_src_md = memory::desc({conv3_src_tz}, dt::f32, tag::any); - auto conv3_bias_md = memory::desc({conv3_bias_tz}, dt::f32, tag::any); - auto conv3_weights_md = memory::desc({conv3_weights_tz}, dt::f32, tag::any); - auto conv3_dst_md = memory::desc({conv3_dst_tz}, dt::f32, tag::any); + auto conv3_src_md = memory::desc( + {conv3_src_tz}, memory::data_type::f32, memory::format_tag::any); + auto conv3_bias_md = memory::desc( + {conv3_bias_tz}, memory::data_type::f32, memory::format_tag::any); + auto conv3_weights_md = memory::desc({conv3_weights_tz}, + memory::data_type::f32, memory::format_tag::any); + auto conv3_dst_md = memory::desc( + {conv3_dst_tz}, memory::data_type::f32, memory::format_tag::any); // create a convolution auto conv3_prim_desc = convolution_forward::primitive_desc(eng, @@ 
-450,17 +474,24 @@ void simple_net(engine::kind engine_kind, int times = 100) { // create memory for user data auto conv4_user_weights_memory - = memory({{conv4_weights_tz}, dt::f32, tag::goihw}, eng); + = memory({{conv4_weights_tz}, memory::data_type::f32, + memory::format_tag::goihw}, + eng); write_to_dnnl_memory(conv4_weights.data(), conv4_user_weights_memory); - auto conv4_user_bias_memory - = memory({{conv4_bias_tz}, dt::f32, tag::x}, eng); + auto conv4_user_bias_memory = memory( + {{conv4_bias_tz}, memory::data_type::f32, memory::format_tag::x}, + eng); write_to_dnnl_memory(conv4_bias.data(), conv4_user_bias_memory); // create memory descriptors for convolution data w/ no specified format - auto conv4_src_md = memory::desc({conv4_src_tz}, dt::f32, tag::any); - auto conv4_bias_md = memory::desc({conv4_bias_tz}, dt::f32, tag::any); - auto conv4_weights_md = memory::desc({conv4_weights_tz}, dt::f32, tag::any); - auto conv4_dst_md = memory::desc({conv4_dst_tz}, dt::f32, tag::any); + auto conv4_src_md = memory::desc( + {conv4_src_tz}, memory::data_type::f32, memory::format_tag::any); + auto conv4_bias_md = memory::desc( + {conv4_bias_tz}, memory::data_type::f32, memory::format_tag::any); + auto conv4_weights_md = memory::desc({conv4_weights_tz}, + memory::data_type::f32, memory::format_tag::any); + auto conv4_dst_md = memory::desc( + {conv4_dst_tz}, memory::data_type::f32, memory::format_tag::any); // create a convolution auto conv4_prim_desc = convolution_forward::primitive_desc(eng, @@ -522,17 +553,24 @@ void simple_net(engine::kind engine_kind, int times = 100) { // create memory for user data auto conv5_user_weights_memory - = memory({{conv5_weights_tz}, dt::f32, tag::goihw}, eng); + = memory({{conv5_weights_tz}, memory::data_type::f32, + memory::format_tag::goihw}, + eng); write_to_dnnl_memory(conv5_weights.data(), conv5_user_weights_memory); - auto conv5_user_bias_memory - = memory({{conv5_bias_tz}, dt::f32, tag::x}, eng); + auto conv5_user_bias_memory = memory( + {{conv5_bias_tz}, memory::data_type::f32, memory::format_tag::x}, + eng); write_to_dnnl_memory(conv5_bias.data(), conv5_user_bias_memory); // create memory descriptors for convolution data w/ no specified format - auto conv5_src_md = memory::desc({conv5_src_tz}, dt::f32, tag::any); - auto conv5_weights_md = memory::desc({conv5_weights_tz}, dt::f32, tag::any); - auto conv5_bias_md = memory::desc({conv5_bias_tz}, dt::f32, tag::any); - auto conv5_dst_md = memory::desc({conv5_dst_tz}, dt::f32, tag::any); + auto conv5_src_md = memory::desc( + {conv5_src_tz}, memory::data_type::f32, memory::format_tag::any); + auto conv5_weights_md = memory::desc({conv5_weights_tz}, + memory::data_type::f32, memory::format_tag::any); + auto conv5_bias_md = memory::desc( + {conv5_bias_tz}, memory::data_type::f32, memory::format_tag::any); + auto conv5_dst_md = memory::desc( + {conv5_dst_tz}, memory::data_type::f32, memory::format_tag::any); // create a convolution auto conv5_prim_desc = convolution_forward::primitive_desc(eng, @@ -591,7 +629,8 @@ void simple_net(engine::kind engine_kind, int times = 100) { std::vector pool5_dst(product(pool5_dst_tz)); - auto pool5_dst_md = memory::desc({pool5_dst_tz}, dt::f32, tag::any); + auto pool5_dst_md = memory::desc( + {pool5_dst_tz}, memory::data_type::f32, memory::format_tag::any); // create a pooling auto pool5_pd = pooling_forward::primitive_desc(eng, @@ -618,16 +657,24 @@ void simple_net(engine::kind engine_kind, int times = 100) { // create memory for user data auto fc6_user_weights_memory - = 
memory({{fc6_weights_tz}, dt::f32, tag::oihw}, eng); + = memory({{fc6_weights_tz}, memory::data_type::f32, + memory::format_tag::oihw}, + eng); write_to_dnnl_memory(fc6_weights.data(), fc6_user_weights_memory); - auto fc6_user_bias_memory = memory({{fc6_bias_tz}, dt::f32, tag::x}, eng); + auto fc6_user_bias_memory = memory( + {{fc6_bias_tz}, memory::data_type::f32, memory::format_tag::x}, + eng); write_to_dnnl_memory(fc6_bias.data(), fc6_user_bias_memory); // create memory descriptors for convolution data w/ no specified format - auto fc6_src_md = memory::desc({fc6_src_tz}, dt::f32, tag::any); - auto fc6_bias_md = memory::desc({fc6_bias_tz}, dt::f32, tag::any); - auto fc6_weights_md = memory::desc({fc6_weights_tz}, dt::f32, tag::any); - auto fc6_dst_md = memory::desc({fc6_dst_tz}, dt::f32, tag::any); + auto fc6_src_md = memory::desc( + {fc6_src_tz}, memory::data_type::f32, memory::format_tag::any); + auto fc6_bias_md = memory::desc( + {fc6_bias_tz}, memory::data_type::f32, memory::format_tag::any); + auto fc6_weights_md = memory::desc( + {fc6_weights_tz}, memory::data_type::f32, memory::format_tag::any); + auto fc6_dst_md = memory::desc( + {fc6_dst_tz}, memory::data_type::f32, memory::format_tag::any); // create a inner_product auto fc6_prim_desc = inner_product_forward::primitive_desc(eng, @@ -667,17 +714,23 @@ void simple_net(engine::kind engine_kind, int times = 100) { std::vector fc7_bias(product(fc7_bias_tz)); // create memory for user data - auto fc7_user_weights_memory - = memory({{fc7_weights_tz}, dt::f32, tag::nc}, eng); + auto fc7_user_weights_memory = memory( + {{fc7_weights_tz}, memory::data_type::f32, memory::format_tag::nc}, + eng); write_to_dnnl_memory(fc7_weights.data(), fc7_user_weights_memory); - auto fc7_user_bias_memory = memory({{fc7_bias_tz}, dt::f32, tag::x}, eng); + auto fc7_user_bias_memory = memory( + {{fc7_bias_tz}, memory::data_type::f32, memory::format_tag::x}, + eng); write_to_dnnl_memory(fc7_bias.data(), fc7_user_bias_memory); // create memory descriptors for convolution data w/ no specified format - auto fc7_bias_md = memory::desc({fc7_bias_tz}, dt::f32, tag::any); - auto fc7_weights_md = memory::desc({fc7_weights_tz}, dt::f32, tag::any); - auto fc7_dst_md = memory::desc({fc7_dst_tz}, dt::f32, tag::any); + auto fc7_bias_md = memory::desc( + {fc7_bias_tz}, memory::data_type::f32, memory::format_tag::any); + auto fc7_weights_md = memory::desc( + {fc7_weights_tz}, memory::data_type::f32, memory::format_tag::any); + auto fc7_dst_md = memory::desc( + {fc7_dst_tz}, memory::data_type::f32, memory::format_tag::any); // create a inner_product auto fc7_prim_desc = inner_product_forward::primitive_desc(eng, @@ -709,18 +762,26 @@ void simple_net(engine::kind engine_kind, int times = 100) { std::vector fc8_bias(product(fc8_bias_tz)); // create memory for user data - auto fc8_user_weights_memory - = memory({{fc8_weights_tz}, dt::f32, tag::nc}, eng); + auto fc8_user_weights_memory = memory( + {{fc8_weights_tz}, memory::data_type::f32, memory::format_tag::nc}, + eng); write_to_dnnl_memory(fc8_weights.data(), fc8_user_weights_memory); - auto fc8_user_bias_memory = memory({{fc8_bias_tz}, dt::f32, tag::x}, eng); + auto fc8_user_bias_memory = memory( + {{fc8_bias_tz}, memory::data_type::f32, memory::format_tag::x}, + eng); write_to_dnnl_memory(fc8_bias.data(), fc8_user_bias_memory); - auto user_dst_memory = memory({{fc8_dst_tz}, dt::f32, tag::nc}, eng); + auto user_dst_memory = memory( + {{fc8_dst_tz}, memory::data_type::f32, memory::format_tag::nc}, + eng); 
write_to_dnnl_memory(user_dst.data(), user_dst_memory); // create memory descriptors for convolution data w/ no specified format - auto fc8_bias_md = memory::desc({fc8_bias_tz}, dt::f32, tag::any); - auto fc8_weights_md = memory::desc({fc8_weights_tz}, dt::f32, tag::any); - auto fc8_dst_md = memory::desc({fc8_dst_tz}, dt::f32, tag::any); + auto fc8_bias_md = memory::desc( + {fc8_bias_tz}, memory::data_type::f32, memory::format_tag::any); + auto fc8_weights_md = memory::desc( + {fc8_weights_tz}, memory::data_type::f32, memory::format_tag::any); + auto fc8_dst_md = memory::desc( + {fc8_dst_tz}, memory::data_type::f32, memory::format_tag::any); // create a inner_product auto fc8_prim_desc = inner_product_forward::primitive_desc(eng, diff --git a/examples/cnn_inference_int8.cpp b/examples/cnn_inference_int8.cpp index 1565f18ebca..7cfbe9b0d1b 100644 --- a/examples/cnn_inference_int8.cpp +++ b/examples/cnn_inference_int8.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2018-2024 Intel Corporation +* Copyright 2018-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -33,9 +33,6 @@ using namespace dnnl; void simple_net_int8(engine::kind engine_kind) { - using tag = memory::format_tag; - using dt = memory::data_type; - auto eng = engine(engine_kind, 0); stream s(eng); @@ -89,12 +86,18 @@ void simple_net_int8(engine::kind engine_kind) { /// The user data will be in its original 32-bit floating point format. /// @snippet cnn_inference_int8.cpp Allocate buffers //[Allocate buffers] - auto user_src_memory = memory({{conv_src_tz}, dt::f32, tag::nchw}, eng); + auto user_src_memory = memory( + {{conv_src_tz}, memory::data_type::f32, memory::format_tag::nchw}, + eng); write_to_dnnl_memory(user_src.data(), user_src_memory); auto user_weights_memory - = memory({{conv_weights_tz}, dt::f32, tag::oihw}, eng); + = memory({{conv_weights_tz}, memory::data_type::f32, + memory::format_tag::oihw}, + eng); write_to_dnnl_memory(conv_weights.data(), user_weights_memory); - auto user_bias_memory = memory({{conv_bias_tz}, dt::f32, tag::x}, eng); + auto user_bias_memory = memory( + {{conv_bias_tz}, memory::data_type::f32, memory::format_tag::x}, + eng); write_to_dnnl_memory(conv_bias.data(), user_bias_memory); //[Allocate buffers] @@ -112,10 +115,14 @@ void simple_net_int8(engine::kind engine_kind) { /// > Bias does not support quantization. 
/// @snippet cnn_inference_int8.cpp Create convolution memory descriptors //[Create convolution memory descriptors] - auto conv_src_md = memory::desc({conv_src_tz}, dt::u8, tag::any); - auto conv_bias_md = memory::desc({conv_bias_tz}, dt::f32, tag::any); - auto conv_weights_md = memory::desc({conv_weights_tz}, dt::s8, tag::any); - auto conv_dst_md = memory::desc({conv_dst_tz}, dt::u8, tag::any); + auto conv_src_md = memory::desc( + {conv_src_tz}, memory::data_type::u8, memory::format_tag::any); + auto conv_bias_md = memory::desc( + {conv_bias_tz}, memory::data_type::f32, memory::format_tag::any); + auto conv_weights_md = memory::desc( + {conv_weights_tz}, memory::data_type::s8, memory::format_tag::any); + auto conv_dst_md = memory::desc( + {conv_dst_tz}, memory::data_type::u8, memory::format_tag::any); //[Create convolution memory descriptors] /// Configuring int8-specific parameters in an int8 primitive is done @@ -129,7 +136,8 @@ void simple_net_int8(engine::kind engine_kind) { conv_attr.set_scales_mask(DNNL_ARG_DST, dst_mask); // Prepare dst scales - auto dst_scale_md = memory::desc({1}, dt::f32, tag::x); + auto dst_scale_md + = memory::desc({1}, memory::data_type::f32, memory::format_tag::x); auto dst_scale_memory = memory(dst_scale_md, eng); write_to_dnnl_memory(dst_scales.data(), dst_scale_memory); //[Configure scaling] @@ -194,7 +202,8 @@ void simple_net_int8(engine::kind engine_kind) { auto conv_src_memory = memory(conv_prim_desc.src_desc(), eng); primitive_attr src_attr; src_attr.set_scales_mask(DNNL_ARG_DST, src_mask); - auto src_scale_md = memory::desc({1}, dt::f32, tag::x); + auto src_scale_md + = memory::desc({1}, memory::data_type::f32, memory::format_tag::x); auto src_scale_memory = memory(src_scale_md, eng); write_to_dnnl_memory(src_scales.data(), src_scale_memory); auto src_reorder_pd @@ -208,7 +217,8 @@ void simple_net_int8(engine::kind engine_kind) { auto conv_weights_memory = memory(conv_prim_desc.weights_desc(), eng); primitive_attr weight_attr; weight_attr.set_scales_mask(DNNL_ARG_DST, weight_mask); - auto wei_scale_md = memory::desc({1}, dt::f32, tag::x); + auto wei_scale_md + = memory::desc({1}, memory::data_type::f32, memory::format_tag::x); auto wei_scale_memory = memory(wei_scale_md, eng); write_to_dnnl_memory(weight_scales.data(), wei_scale_memory); auto weight_reorder_pd @@ -251,7 +261,9 @@ void simple_net_int8(engine::kind engine_kind) { /// computation output data. /// @snippet cnn_inference_int8.cpp Dequantize the result ///[Dequantize the result] - auto user_dst_memory = memory({{conv_dst_tz}, dt::f32, tag::nchw}, eng); + auto user_dst_memory = memory( + {{conv_dst_tz}, memory::data_type::f32, memory::format_tag::nchw}, + eng); write_to_dnnl_memory(user_dst.data(), user_dst_memory); primitive_attr dst_attr; dst_attr.set_scales_mask(DNNL_ARG_SRC, dst_mask); diff --git a/examples/cnn_training_bf16.cpp b/examples/cnn_training_bf16.cpp index 9ef4f8a4d1b..0bcee7201f2 100644 --- a/examples/cnn_training_bf16.cpp +++ b/examples/cnn_training_bf16.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2022 Intel Corporation +* Copyright 2019-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
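The int8 example above routes all quantization through `dnnl::primitive_attr`. The recurring pattern, with mask 0 meaning a single per-tensor scale and the convention real_value = scale * quantized_value, looks like this in isolation (a minimal sketch; an engine `eng` is assumed):

    using namespace dnnl;
    primitive_attr attr;
    attr.set_scales_mask(DNNL_ARG_DST, 0); // one scale for the whole dst
    // The scale value itself is passed as an f32 memory object at execution
    // time under DNNL_ARG_ATTR_SCALES | DNNL_ARG_DST.
    auto scale_md = memory::desc(
            {1}, memory::data_type::f32, memory::format_tag::x);
    auto scale_mem = memory(scale_md, eng);
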
@@ -38,9 +38,6 @@ using namespace dnnl; void simple_net(engine::kind engine_kind) { - using tag = memory::format_tag; - using dt = memory::data_type; - auto eng = engine(engine_kind, 0); stream s(eng); @@ -79,27 +76,36 @@ void simple_net(engine::kind engine_kind) { conv_bias[i] = sinf((float)i); // create memory for user data - auto conv_user_src_memory - = memory({{conv_src_tz}, dt::f32, tag::nchw}, eng); + auto conv_user_src_memory = memory( + {{conv_src_tz}, memory::data_type::f32, memory::format_tag::nchw}, + eng); write_to_dnnl_memory(net_src.data(), conv_user_src_memory); auto conv_user_weights_memory - = memory({{conv_weights_tz}, dt::f32, tag::oihw}, eng); + = memory({{conv_weights_tz}, memory::data_type::f32, + memory::format_tag::oihw}, + eng); write_to_dnnl_memory(conv_weights.data(), conv_user_weights_memory); - auto conv_user_bias_memory = memory({{conv_bias_tz}, dt::f32, tag::x}, eng); + auto conv_user_bias_memory = memory( + {{conv_bias_tz}, memory::data_type::f32, memory::format_tag::x}, + eng); write_to_dnnl_memory(conv_bias.data(), conv_user_bias_memory); // create memory descriptors for bfloat16 convolution data w/ no specified // format tag(`any`) // tag `any` lets a primitive(convolution in this case) // chose the memory format preferred for best performance. - auto conv_src_md = memory::desc({conv_src_tz}, dt::bf16, tag::any); - auto conv_weights_md = memory::desc({conv_weights_tz}, dt::bf16, tag::any); - auto conv_dst_md = memory::desc({conv_dst_tz}, dt::bf16, tag::any); + auto conv_src_md = memory::desc( + {conv_src_tz}, memory::data_type::bf16, memory::format_tag::any); + auto conv_weights_md = memory::desc({conv_weights_tz}, + memory::data_type::bf16, memory::format_tag::any); + auto conv_dst_md = memory::desc( + {conv_dst_tz}, memory::data_type::bf16, memory::format_tag::any); // here bias data type is set to bf16. // additionally, f32 data type is supported for bf16 convolution. 
- auto conv_bias_md = memory::desc({conv_bias_tz}, dt::bf16, tag::any); + auto conv_bias_md = memory::desc( + {conv_bias_tz}, memory::data_type::bf16, memory::format_tag::any); // create a convolution primitive descriptor @@ -225,11 +231,13 @@ void simple_net(engine::kind engine_kind) { memory::dims pool_padding = {0, 0}; // create memory for pool dst data in user format - auto pool_user_dst_memory - = memory({{pool_dst_tz}, dt::f32, tag::nchw}, eng); + auto pool_user_dst_memory = memory( + {{pool_dst_tz}, memory::data_type::f32, memory::format_tag::nchw}, + eng); // create pool dst memory descriptor in format any for bfloat16 data type - auto pool_dst_md = memory::desc({pool_dst_tz}, dt::bf16, tag::any); + auto pool_dst_md = memory::desc( + {pool_dst_tz}, memory::data_type::bf16, memory::format_tag::any); // create a pooling primitive descriptor auto pool_pd = pooling_forward::primitive_desc(eng, prop_kind::forward, @@ -269,14 +277,17 @@ void simple_net(engine::kind engine_kind) { net_diff_dst[i] = sinf((float)i); // create memory for user diff dst data stored in float data type - auto pool_user_diff_dst_memory - = memory({{pool_dst_tz}, dt::f32, tag::nchw}, eng); + auto pool_user_diff_dst_memory = memory( + {{pool_dst_tz}, memory::data_type::f32, memory::format_tag::nchw}, + eng); write_to_dnnl_memory(net_diff_dst.data(), pool_user_diff_dst_memory); // Backward pooling // create memory descriptors for pooling - auto pool_diff_src_md = memory::desc({lrn_data_tz}, dt::bf16, tag::any); - auto pool_diff_dst_md = memory::desc({pool_dst_tz}, dt::bf16, tag::any); + auto pool_diff_src_md = memory::desc( + {lrn_data_tz}, memory::data_type::bf16, memory::format_tag::any); + auto pool_diff_dst_md = memory::desc( + {pool_dst_tz}, memory::data_type::bf16, memory::format_tag::any); // backward primitive descriptor needs to hint forward descriptor auto pool_bwd_pd = pooling_backward::primitive_desc(eng, @@ -305,7 +316,8 @@ void simple_net(engine::kind engine_kind) { {DNNL_ARG_WORKSPACE, pool_workspace_memory}}); // Backward lrn - auto lrn_diff_dst_md = memory::desc({lrn_data_tz}, dt::bf16, tag::any); + auto lrn_diff_dst_md = memory::desc( + {lrn_data_tz}, memory::data_type::bf16, memory::format_tag::any); const auto &lrn_diff_src_md = lrn_diff_dst_md; // create backward lrn primitive descriptor @@ -335,8 +347,10 @@ void simple_net(engine::kind engine_kind) { {DNNL_ARG_WORKSPACE, lrn_workspace_memory}}); // Backward relu - auto relu_diff_src_md = memory::desc({relu_data_tz}, dt::bf16, tag::any); - auto relu_diff_dst_md = memory::desc({relu_data_tz}, dt::bf16, tag::any); + auto relu_diff_src_md = memory::desc( + {relu_data_tz}, memory::data_type::bf16, memory::format_tag::any); + auto relu_diff_dst_md = memory::desc( + {relu_data_tz}, memory::data_type::bf16, memory::format_tag::any); auto relu_src_md = conv_pd.dst_desc(); // create backward relu primitive_descriptor @@ -367,14 +381,20 @@ void simple_net(engine::kind engine_kind) { // create user format diff weights and diff bias memory for float data type auto conv_user_diff_weights_memory - = memory({{conv_weights_tz}, dt::f32, tag::nchw}, eng); - auto conv_diff_bias_memory = memory({{conv_bias_tz}, dt::f32, tag::x}, eng); + = memory({{conv_weights_tz}, memory::data_type::f32, + memory::format_tag::nchw}, + eng); + auto conv_diff_bias_memory = memory( + {{conv_bias_tz}, memory::data_type::f32, memory::format_tag::x}, + eng); // create memory descriptors for bfloat16 convolution data - auto conv_bwd_src_md = memory::desc({conv_src_tz}, dt::bf16, 
tag::any); - auto conv_diff_weights_md - = memory::desc({conv_weights_tz}, dt::bf16, tag::any); - auto conv_diff_dst_md = memory::desc({conv_dst_tz}, dt::bf16, tag::any); + auto conv_bwd_src_md = memory::desc( + {conv_src_tz}, memory::data_type::bf16, memory::format_tag::any); + auto conv_diff_weights_md = memory::desc({conv_weights_tz}, + memory::data_type::bf16, memory::format_tag::any); + auto conv_diff_dst_md = memory::desc( + {conv_dst_tz}, memory::data_type::bf16, memory::format_tag::any); // use diff bias provided by the user auto conv_diff_bias_md = conv_diff_bias_memory.get_desc(); diff --git a/examples/cnn_training_f32.cpp b/examples/cnn_training_f32.cpp index a5aaa5e2b4e..89668569d75 100644 --- a/examples/cnn_training_f32.cpp +++ b/examples/cnn_training_f32.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2016-2022 Intel Corporation +* Copyright 2016-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -35,9 +35,6 @@ using namespace dnnl; void simple_net(engine::kind engine_kind) { - using tag = memory::format_tag; - using dt = memory::data_type; - auto eng = engine(engine_kind, 0); stream s(eng); @@ -75,23 +72,32 @@ void simple_net(engine::kind engine_kind) { conv_bias[i] = sinf((float)i); // create memory for user data - auto conv_user_src_memory - = memory({{conv_src_tz}, dt::f32, tag::nchw}, eng); + auto conv_user_src_memory = memory( + {{conv_src_tz}, memory::data_type::f32, memory::format_tag::nchw}, + eng); write_to_dnnl_memory(net_src.data(), conv_user_src_memory); auto conv_user_weights_memory - = memory({{conv_weights_tz}, dt::f32, tag::oihw}, eng); + = memory({{conv_weights_tz}, memory::data_type::f32, + memory::format_tag::oihw}, + eng); write_to_dnnl_memory((void *)conv_weights.data(), conv_user_weights_memory); - auto conv_user_bias_memory = memory({{conv_bias_tz}, dt::f32, tag::x}, eng); + auto conv_user_bias_memory = memory( + {{conv_bias_tz}, memory::data_type::f32, memory::format_tag::x}, + eng); write_to_dnnl_memory(conv_bias.data(), conv_user_bias_memory); // create memory descriptors for convolution data w/ no specified // format tag(`any`) // tag `any` lets a primitive(convolution in this case) // chose the memory format preferred for best performance. 
- auto conv_src_md = memory::desc({conv_src_tz}, dt::f32, tag::any); - auto conv_bias_md = memory::desc({conv_bias_tz}, dt::f32, tag::any); - auto conv_weights_md = memory::desc({conv_weights_tz}, dt::f32, tag::any); - auto conv_dst_md = memory::desc({conv_dst_tz}, dt::f32, tag::any); + auto conv_src_md = memory::desc( + {conv_src_tz}, memory::data_type::f32, memory::format_tag::any); + auto conv_bias_md = memory::desc( + {conv_bias_tz}, memory::data_type::f32, memory::format_tag::any); + auto conv_weights_md = memory::desc( + {conv_weights_tz}, memory::data_type::f32, memory::format_tag::any); + auto conv_dst_md = memory::desc( + {conv_dst_tz}, memory::data_type::f32, memory::format_tag::any); // create a convolution primitive descriptor auto conv_pd = convolution_forward::primitive_desc(eng, prop_kind::forward, @@ -189,12 +195,14 @@ void simple_net(engine::kind engine_kind) { memory::dims pool_padding = {0, 0}; // create memory for pool dst data in user format - auto pool_user_dst_memory - = memory({{pool_dst_tz}, dt::f32, tag::nchw}, eng); + auto pool_user_dst_memory = memory( + {{pool_dst_tz}, memory::data_type::f32, memory::format_tag::nchw}, + eng); write_to_dnnl_memory(net_dst.data(), pool_user_dst_memory); // create pool dst memory descriptor in format any - auto pool_dst_md = memory::desc({pool_dst_tz}, dt::f32, tag::any); + auto pool_dst_md = memory::desc( + {pool_dst_tz}, memory::data_type::f32, memory::format_tag::any); // create a pooling primitive descriptor auto pool_pd = pooling_forward::primitive_desc(eng, prop_kind::forward, @@ -233,14 +241,17 @@ void simple_net(engine::kind engine_kind) { net_diff_dst[i] = sinf((float)i); // create memory for user diff dst data - auto pool_user_diff_dst_memory - = memory({{pool_dst_tz}, dt::f32, tag::nchw}, eng); + auto pool_user_diff_dst_memory = memory( + {{pool_dst_tz}, memory::data_type::f32, memory::format_tag::nchw}, + eng); write_to_dnnl_memory(net_diff_dst.data(), pool_user_diff_dst_memory); // Backward pooling // create memory descriptors for pooling - auto pool_diff_src_md = memory::desc({lrn_data_tz}, dt::f32, tag::any); - auto pool_diff_dst_md = memory::desc({pool_dst_tz}, dt::f32, tag::any); + auto pool_diff_src_md = memory::desc( + {lrn_data_tz}, memory::data_type::f32, memory::format_tag::any); + auto pool_diff_dst_md = memory::desc( + {pool_dst_tz}, memory::data_type::f32, memory::format_tag::any); // backward primitive descriptor needs to hint forward descriptor auto pool_bwd_pd = pooling_backward::primitive_desc(eng, @@ -269,7 +280,8 @@ void simple_net(engine::kind engine_kind) { {DNNL_ARG_WORKSPACE, pool_workspace_memory}}); // Backward lrn - auto lrn_diff_dst_md = memory::desc({lrn_data_tz}, dt::f32, tag::any); + auto lrn_diff_dst_md = memory::desc( + {lrn_data_tz}, memory::data_type::f32, memory::format_tag::any); const auto &lrn_diff_src_md = lrn_diff_dst_md; // create backward lrn primitive descriptor @@ -299,8 +311,10 @@ void simple_net(engine::kind engine_kind) { {DNNL_ARG_WORKSPACE, lrn_workspace_memory}}); // Backward relu - auto relu_diff_src_md = memory::desc({relu_data_tz}, dt::f32, tag::any); - auto relu_diff_dst_md = memory::desc({relu_data_tz}, dt::f32, tag::any); + auto relu_diff_src_md = memory::desc( + {relu_data_tz}, memory::data_type::f32, memory::format_tag::any); + auto relu_diff_dst_md = memory::desc( + {relu_data_tz}, memory::data_type::f32, memory::format_tag::any); auto relu_src_md = conv_pd.dst_desc(); // create backward relu primitive_descriptor @@ -333,18 +347,25 @@ void 
simple_net(engine::kind engine_kind) { std::vector conv_diff_bias_buffer(product(conv_bias_tz)); auto conv_user_diff_weights_memory - = memory({{conv_weights_tz}, dt::f32, tag::nchw}, eng); + = memory({{conv_weights_tz}, memory::data_type::f32, + memory::format_tag::nchw}, + eng); write_to_dnnl_memory(conv_user_diff_weights_buffer.data(), conv_user_diff_weights_memory); - auto conv_diff_bias_memory = memory({{conv_bias_tz}, dt::f32, tag::x}, eng); + auto conv_diff_bias_memory = memory( + {{conv_bias_tz}, memory::data_type::f32, memory::format_tag::x}, + eng); write_to_dnnl_memory(conv_diff_bias_buffer.data(), conv_diff_bias_memory); // create memory descriptors - auto conv_bwd_src_md = memory::desc({conv_src_tz}, dt::f32, tag::any); - auto conv_diff_bias_md = memory::desc({conv_bias_tz}, dt::f32, tag::any); - auto conv_diff_weights_md - = memory::desc({conv_weights_tz}, dt::f32, tag::any); - auto conv_diff_dst_md = memory::desc({conv_dst_tz}, dt::f32, tag::any); + auto conv_bwd_src_md = memory::desc( + {conv_src_tz}, memory::data_type::f32, memory::format_tag::any); + auto conv_diff_bias_md = memory::desc( + {conv_bias_tz}, memory::data_type::f32, memory::format_tag::any); + auto conv_diff_weights_md = memory::desc( + {conv_weights_tz}, memory::data_type::f32, memory::format_tag::any); + auto conv_diff_dst_md = memory::desc( + {conv_dst_tz}, memory::data_type::f32, memory::format_tag::any); // create backward convolution primitive descriptor auto conv_bwd_weights_pd = convolution_backward_weights::primitive_desc(eng, diff --git a/examples/cpu_matmul_coo.cpp b/examples/cpu_matmul_coo.cpp new file mode 100644 index 00000000000..e16411015ea --- /dev/null +++ b/examples/cpu_matmul_coo.cpp @@ -0,0 +1,108 @@ +/******************************************************************************* +* Copyright 2024-2025 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +/// @example cpu_matmul_coo.cpp +/// > Annotated version: @ref cpu_matmul_coo_cpp +/// +/// This C++ API example demonstrates how to create and execute a +/// [MatMul](@ref dev_guide_matmul) primitive that uses a source tensor +/// encoded with the COO sparse encoding. 
+///
+/// @page cpu_matmul_coo_cpp MatMul Primitive Example
+///
+/// @include cpu_matmul_coo.cpp
+
+#include <algorithm>
+#include <cmath>
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include "dnnl.hpp"
+#include "example_utils.hpp"
+
+using namespace dnnl;
+
+bool check_result(dnnl::memory dst_mem) {
+    // clang-format off
+    const std::vector<float> expected_result = {8.750000, 11.250000, 2.500000,
+                                                6.000000, 2.250000, 3.750000,
+                                                19.000000, 15.500000, 5.250000,
+                                                4.000000, 7.000000, 3.000000};
+    // clang-format on
+    std::vector<float> dst_data(expected_result.size());
+    read_from_dnnl_memory(dst_data.data(), dst_mem);
+    return expected_result == dst_data;
+}
+
+void sparse_matmul() {
+    dnnl::engine engine(engine::kind::cpu, 0);
+
+    const memory::dim M = 4;
+    const memory::dim N = 3;
+    const memory::dim K = 6;
+
+    // A sparse matrix represented in the COO format.
+    std::vector<float> src_coo_values = {2.5f, 1.5f, 1.5f, 2.5f, 2.0f};
+    std::vector<int32_t> src_coo_row_indices = {0, 1, 2, 2, 3};
+    std::vector<int32_t> src_coo_col_indices = {0, 2, 0, 5, 1};
+
+    // clang-format off
+    std::vector<float> weights_data = {3.5f, 4.5f, 1.0f,
+                                       2.0f, 3.5f, 1.5f,
+                                       4.0f, 1.5f, 2.5f,
+                                       3.5f, 5.5f, 4.5f,
+                                       1.5f, 2.5f, 5.5f,
+                                       5.5f, 3.5f, 1.5f};
+    // clang-format on
+
+    const int nnz = static_cast<int>(src_coo_values.size());
+
+    // Create a memory descriptor for COO format by providing information
+    // about number of non-zero entries and data types of metadata.
+    const auto src_coo_md = memory::desc::coo(
+            {M, K}, memory::data_type::f32, nnz, memory::data_type::s32);
+    const auto wei_md = memory::desc(
+            {K, N}, memory::data_type::f32, memory::format_tag::oi);
+    const auto dst_md = memory::desc(
+            {M, N}, memory::data_type::f32, memory::format_tag::nc);
+
+    // This memory is created for the given values and metadata of COO format.
+    memory src_coo_mem(src_coo_md, engine,
+            {src_coo_values.data(), src_coo_row_indices.data(),
+                    src_coo_col_indices.data()});
+    memory wei_mem(wei_md, engine, weights_data.data());
+    memory dst_mem(dst_md, engine);
+
+    dnnl::stream stream(engine);
+
+    auto sparse_matmul_pd
+            = matmul::primitive_desc(engine, src_coo_md, wei_md, dst_md);
+    auto sparse_matmul_prim = matmul(sparse_matmul_pd);
+
+    std::unordered_map<int, memory> sparse_matmul_args;
+    sparse_matmul_args.insert({DNNL_ARG_SRC, src_coo_mem});
+    sparse_matmul_args.insert({DNNL_ARG_WEIGHTS, wei_mem});
+    sparse_matmul_args.insert({DNNL_ARG_DST, dst_mem});
+
+    sparse_matmul_prim.execute(stream, sparse_matmul_args);
+    stream.wait();
+    if (!check_result(dst_mem)) throw std::runtime_error("Unexpected output.");
+}
+
+int main(int argc, char **argv) {
+    return handle_example_errors({engine::kind::cpu}, sparse_matmul);
+}
diff --git a/examples/cpu_matmul_csr.cpp b/examples/cpu_matmul_csr.cpp
index 7033f4aef81..e7823b685bd 100644
--- a/examples/cpu_matmul_csr.cpp
+++ b/examples/cpu_matmul_csr.cpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2023 Intel Corporation
+* Copyright 2023-2025 Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -18,7 +18,7 @@
 /// > Annotated version: @ref cpu_matmul_csr_cpp
 ///
 /// This C++ API example demonstrates how to create and execute a
-/// [MatMul](@ref dev_guide_matmul) primitive that uses a weights tensor
+/// [MatMul](@ref dev_guide_matmul) primitive that uses a source tensor
 /// encoded with the CSR sparse encoding.
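For reference, the two sparse encodings used by these examples differ only in their metadata. The five COO triplets above place 2.5 at (0, 0), 1.5 at (1, 2), 1.5 at (2, 0), 2.5 at (2, 5), and 2.0 at (3, 1) of the 4x6 source matrix; the two descriptors compare as follows (both signatures taken from the examples):

    // COO: one (row, col) index pair per non-zero value.
    const auto coo_md = memory::desc::coo(
            {M, K}, memory::data_type::f32, nnz, memory::data_type::s32);
    // CSR: per-value column indices plus compressed row pointers, hence
    // two metadata data types.
    const auto csr_md = memory::desc::csr({M, K}, memory::data_type::f32,
            nnz, memory::data_type::s32, memory::data_type::s32);
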
 ///
 /// @page cpu_matmul_csr_cpp MatMul Primitive Example
@@ -36,9 +36,6 @@
 
 using namespace dnnl;
 
-using tag = memory::format_tag;
-using dt = memory::data_type;
-
 bool check_result(dnnl::memory dst_mem) {
     // clang-format off
     const std::vector<float> expected_result = {8.750000, 11.250000, 2.500000,
@@ -77,10 +74,12 @@ void sparse_matmul() {
 
     // Create a memory descriptor for CSR format by providing information
     // about number of non-zero entries and data types of metadata.
-    const auto src_csr_md
-            = memory::desc::csr({M, K}, dt::f32, nnz, dt::s32, dt::s32);
-    const auto wei_md = memory::desc({K, N}, dt::f32, tag::oi);
-    const auto dst_md = memory::desc({M, N}, dt::f32, tag::nc);
+    const auto src_csr_md = memory::desc::csr({M, K}, memory::data_type::f32,
+            nnz, memory::data_type::s32, memory::data_type::s32);
+    const auto wei_md = memory::desc(
+            {K, N}, memory::data_type::f32, memory::format_tag::oi);
+    const auto dst_md = memory::desc(
+            {M, N}, memory::data_type::f32, memory::format_tag::nc);
 
     // This memory is created for the given values and metadata of CSR format.
     memory src_csr_mem(src_csr_md, engine,
diff --git a/examples/cpu_matmul_weights_compression.cpp b/examples/cpu_matmul_weights_compression.cpp
index 1169838b6e5..4bbc772f8c9 100644
--- a/examples/cpu_matmul_weights_compression.cpp
+++ b/examples/cpu_matmul_weights_compression.cpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2023 Intel Corporation
+* Copyright 2023-2025 Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -37,9 +37,6 @@
 
 using namespace dnnl;
 
-using tag = memory::format_tag;
-using dt = memory::data_type;
-
 void matmul_example(dnnl::engine::kind engine_kind) {
 
     // Create execution dnnl::engine.
dnnl::engine engine(engine_kind, 0); @@ -79,22 +76,31 @@ void matmul_example(dnnl::engine::kind engine_kind) { const memory::dim nnz = std::count_if(weights_data.begin(), weights_data.end(), [](float v) { return v != 0.0f; }); - auto src_md = memory::desc(src_dims, dt::f32, tag::ab); - auto dst_md = memory::desc(dst_dims, dt::f32, tag::ab); + auto src_md = memory::desc( + src_dims, memory::data_type::f32, memory::format_tag::ab); + auto dst_md = memory::desc( + dst_dims, memory::data_type::f32, memory::format_tag::ab); auto src_mem = memory(src_md, engine); auto dst_mem = memory(dst_md, engine); - auto user_src_mem = memory({src_dims, dt::f32, tag::ab}, engine); - auto user_weights_mem = memory({weights_dims, dt::f32, tag::ab}, engine); - auto user_dst_mem = memory({dst_dims, dt::f32, tag::ab}, engine); + auto user_src_mem = memory( + {src_dims, memory::data_type::f32, memory::format_tag::ab}, engine); + auto user_weights_mem = memory( + {weights_dims, memory::data_type::f32, memory::format_tag::ab}, + engine); + auto user_dst_mem = memory( + {dst_dims, memory::data_type::f32, memory::format_tag::ab}, engine); write_to_dnnl_memory(src_data.data(), src_mem); write_to_dnnl_memory(weights_data.data(), user_weights_mem); - auto matmul_src_md = memory::desc(src_dims, dt::u8, tag::any); - auto matmul_weights_md = memory::desc::packed(weights_dims, dt::s8, nnz); - auto matmul_dst_md = memory::desc(dst_dims, dt::u8, tag::any); + auto matmul_src_md = memory::desc( + src_dims, memory::data_type::u8, memory::format_tag::any); + auto matmul_weights_md + = memory::desc::packed(weights_dims, memory::data_type::s8, nnz); + auto matmul_dst_md = memory::desc( + dst_dims, memory::data_type::u8, memory::format_tag::any); matmul::primitive_desc matmul_pd; try { diff --git a/examples/example_utils.hpp b/examples/example_utils.hpp index 136dfe6147f..8ff0676dc77 100644 --- a/examples/example_utils.hpp +++ b/examples/example_utils.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2023 Intel Corporation +* Copyright 2019-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,6 +14,9 @@ * limitations under the License. *******************************************************************************/ +/// @file +/// Examples C++ Utility Functions + #ifndef EXAMPLE_UTILS_HPP #define EXAMPLE_UTILS_HPP @@ -76,7 +79,7 @@ inline void finalize() { #endif } -dnnl::engine::kind validate_engine_kind(dnnl::engine::kind akind) { +inline dnnl::engine::kind validate_engine_kind(dnnl::engine::kind akind) { // Checking if a GPU exists on the machine if (akind == dnnl::engine::kind::gpu) { if (dnnl::engine::get_count(dnnl::engine::kind::gpu) == 0) { @@ -91,6 +94,7 @@ dnnl::engine::kind validate_engine_kind(dnnl::engine::kind akind) { // Exception class to indicate that the example uses a feature that is not // available on the current systems. It is not treated as an error then, but // just notifies a user. +// NOLINTNEXTLINE(readability-identifier-naming) struct example_allows_unimplemented : public std::exception { example_allows_unimplemented(const char *message) noexcept : message(message) {} @@ -104,7 +108,7 @@ inline const char *engine_kind2str_upper(dnnl::engine::kind kind); // Returns `0` on success, `1` or oneDNN error, and `2` on example error. 
 inline int handle_example_errors(
         std::initializer_list<dnnl::engine::kind> engine_kinds,
-        std::function<void()> example) {
+        const std::function<void()> &example) {
     int exit_code = 0;
 
     try {
diff --git a/examples/graph/gated_mlp.cpp b/examples/graph/gated_mlp.cpp
new file mode 100644
index 00000000000..fd7547a486c
--- /dev/null
+++ b/examples/graph/gated_mlp.cpp
@@ -0,0 +1,275 @@
+/*******************************************************************************
+* Copyright 2024-2025 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#include <algorithm>
+#include <chrono>
+#include <cmath>
+#include <cstring>
+#include <iomanip>
+#include <iostream>
+#include <random>
+#include <vector>
+
+#include "oneapi/dnnl/dnnl.hpp"
+#include "oneapi/dnnl/dnnl_graph.hpp"
+
+#include "graph_example_utils.hpp"
+
+using namespace dnnl;
+
+using namespace dnnl::graph;
+using layout_type = logical_tensor::layout_type;
+using dim = logical_tensor::dim;
+using dims = logical_tensor::dims;
+
+struct mlp_dims_t {
+    dim mb;
+    dim ic;
+    dim oc;
+};
+
+static const int min_runs = 4;
+
+// This is adapted from the fill_random() function in matmul_perf.cpp.
+void fill_random(std::vector<float> &out) {
+    static std::vector<float> random_data_f;
+    constexpr size_t nrand = 1037;
+
+    if (random_data_f.empty()) {
+        std::mt19937 generator;
+        std::uniform_real_distribution<float> dist_f(-1.0f, 1.0f);
+
+        random_data_f.resize(nrand);
+        for (auto &d : random_data_f)
+            d = dist_f(generator);
+    }
+
+    for (size_t i = 0; i < out.size(); i += nrand) {
+        size_t chunk = std::min(nrand, out.size() - i);
+        std::memcpy(&out[i], random_data_f.data(), chunk * sizeof(float));
+    }
+}
+
+const char *get_type_string(logical_tensor::data_type dt) {
+    const char *type_string = "unknown";
+
+#define TYPE_CASE(T) \
+    if (dt == logical_tensor::data_type::T) type_string = #T;
+    TYPE_CASE(f16);
+    TYPE_CASE(f32);
+    TYPE_CASE(bf16);
+#undef TYPE_CASE
+
+    return type_string;
+}
+
+void print_test_case(logical_tensor::data_type dt, const mlp_dims_t &p) {
+    std::cout << '[' << std::setw(4) << get_type_string(dt);
+    std::cout << " mb = " << p.mb << ", ic = " << p.ic << ", oc = " << p.oc;
+    std::cout << "] " << std::flush;
+}
+
+void bench_gated_mlp(engine::kind ekind, logical_tensor::data_type dt,
+        const mlp_dims_t &p, double time_limit = 0.) {
+    const bool quick_test = (time_limit == 0.);
+    print_test_case(dt, p);
+
+    allocator alloc = create_allocator(ekind);
+
+    // Create execution dnnl::engine.
+    dnnl::engine eng = make_engine_with_allocator(ekind, 0, alloc);
+    // Create dnnl::stream.
+    dnnl::stream strm(eng);
+
+    // input shape
+    const dims src_sz = {p.mb, p.ic};
+    // weight0/weight1 shape: fc_gate and fc_up
+    const dims wei0_sz = {p.ic, p.oc};
+    // hidden shape
+    const dims hd_sz = {p.mb, p.oc};
+    // weight2 shape: fc_down
+    const dims wei2_sz = {p.oc, p.ic};
+    // output shape
+    const dims out_sz = {p.mb, p.ic};
+
+    // Incremental IDs used to create logical tensors and operations.
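The ops created next assemble the gated-MLP computation

    dst = (swish(src · W0) ⊙ (src · W1)) · W2,  with swish(x) = x · sigmoid(x),

where ⊙ is an elementwise multiply; swish is spelled out as a Sigmoid op followed by a Multiply op on the fc_gate output, since the graph is composed from these primitive op kinds.
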
+    size_t id = 0;
+
+    // fc_gate
+    auto src = logical_tensor(id++, dt, src_sz, layout_type::strided);
+    auto wei0 = logical_tensor(id++, dt, wei0_sz, layout_type::strided);
+    auto out0 = logical_tensor(id++, dt, hd_sz, layout_type::strided);
+    auto fc_gate = op(id++, op::kind::MatMul, "fc_gate");
+    fc_gate.add_inputs({src, wei0});
+    fc_gate.add_outputs({out0});
+
+    // fc_up
+    auto wei1 = logical_tensor(id++, dt, wei0_sz, layout_type::strided);
+    auto out1 = logical_tensor(id++, dt, hd_sz, layout_type::strided);
+    auto fc_up = op(id++, op::kind::MatMul, "fc_up");
+    fc_up.add_inputs({src, wei1});
+    fc_up.add_outputs({out1});
+
+    // activation swish: sigmoid
+    auto out2 = logical_tensor(id++, dt, hd_sz, layout_type::strided);
+    auto swi_sig = op(id++, op::kind::Sigmoid, "swish/sigmoid");
+    swi_sig.add_inputs({out0});
+    swi_sig.add_outputs({out2});
+
+    // activation swish: multiply
+    auto out3 = logical_tensor(id++, dt, hd_sz, layout_type::strided);
+    auto swi_mul = op(id++, op::kind::Multiply, "swish/multiply");
+    swi_mul.add_inputs({out0, out2});
+    swi_mul.add_outputs({out3});
+
+    // multiplication
+    auto out4 = logical_tensor(id++, dt, hd_sz, layout_type::strided);
+    auto mul = op(id++, op::kind::Multiply, "mul");
+    mul.add_inputs({out3, out1});
+    mul.add_outputs({out4});
+
+    // fc_down
+    auto wei2 = logical_tensor(id++, dt, wei2_sz, layout_type::strided);
+    auto dst = logical_tensor(id++, dt, out_sz, layout_type::strided);
+    auto fc_down = op(id++, op::kind::MatMul, "fc_down");
+    fc_down.add_inputs({out4, wei2});
+    fc_down.add_outputs({dst});
+
+    // Construct a gated mlp graph with engine kind and operations.
+    dnnl::graph::graph mlp(ekind);
+    mlp.add_op(fc_gate);
+    mlp.add_op(fc_up);
+    mlp.add_op(swi_sig);
+    mlp.add_op(swi_mul);
+    mlp.add_op(mul);
+    mlp.add_op(fc_down);
+    mlp.finalize();
+
+    // Get partitions from the mlp graph.
+    std::vector<partition> partitions = mlp.get_partitions();
+    // This is just for oneDNN testing purposes.
+    if (partitions.size() != 1) {
+        std::cout << "unsupported mlp" << std::endl;
+        return;
+    }
+
+    // Compile the partition with inputs, outputs, and an engine.
+    compiled_partition cp
+            = partitions[0].compile({src, wei0, wei1, wei2}, {dst}, eng);
+
+    // Create tensor objects
+    auto ts_src = tensor(src, eng);
+    auto ts_wei0 = tensor(wei0, eng);
+    auto ts_wei1 = tensor(wei1, eng);
+    auto ts_wei2 = tensor(wei2, eng);
+    auto ts_dst = tensor(dst, eng);
+
+    // Allocate user data.
+    std::vector<float> src_data(product(src_sz));
+    std::vector<float> wei0_data(product(wei0_sz));
+    std::vector<float> wei1_data(product(wei0_sz));
+    std::vector<float> wei2_data(product(wei2_sz));
+
+    fill_random(src_data);
+    fill_random(wei0_data);
+    fill_random(wei1_data);
+    fill_random(wei2_data);
+
+    // Write data to tensor object's handle.
+    write_to_dnnl_tensor(src_data.data(), ts_src);
+    write_to_dnnl_tensor(wei0_data.data(), ts_wei0);
+    write_to_dnnl_tensor(wei1_data.data(), ts_wei1);
+    write_to_dnnl_tensor(wei2_data.data(), ts_wei2);
+
+    // Warmup run.
+    // Execute the compiled partition of the gated MLP.
+    cp.execute(strm, {ts_src, ts_wei0, ts_wei1, ts_wei2}, {ts_dst});
+
+    // Wait for the computation to finish.
+    strm.wait();
+
+    // First run.
+    auto start_first = std::chrono::steady_clock::now();
+    cp.execute(strm, {ts_src, ts_wei0, ts_wei1, ts_wei2}, {ts_dst});
+    strm.wait();
+    auto end_first = std::chrono::steady_clock::now();
+    std::chrono::duration<double, std::milli> dur_first
+            = end_first - start_first;
+
+    if (quick_test) return;
+
+    // Timing runs.
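The loop below is sized from the first-run latency and the reported average excludes that first run. The same scheme as a standalone helper, for reference (the `avg_ms` name and the std::function wrapper are illustrative, not part of the example):

    #include <chrono>
    #include <functional>

    // Average per-call time in milliseconds over `runs` calls, after one
    // untimed warmup call that absorbs first-run costs.
    inline double avg_ms(const std::function<void()> &step, int runs) {
        step(); // warmup
        auto t0 = std::chrono::steady_clock::now();
        for (int i = 0; i < runs; i++)
            step();
        std::chrono::duration<double, std::milli> d
                = std::chrono::steady_clock::now() - t0;
        return d.count() / runs;
    }
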
+    const int runs = std::max(min_runs, int(time_limit / dur_first.count()));
+    auto start = std::chrono::steady_clock::now();
+    for (int i = 0; i <= runs; i++) {
+        cp.execute(strm, {ts_src, ts_wei0, ts_wei1, ts_wei2}, {ts_dst});
+    }
+    strm.wait();
+    auto end = std::chrono::steady_clock::now();
+    std::chrono::duration<double, std::milli> duration = end - start;
+
+    // Display the results.
+    double avg_time = (duration.count() - dur_first.count()) / runs;
+    std::cout << "graph runs: " << runs + 1 << "; ";
+    std::cout << "avg_time: " << avg_time << " ms" << std::endl;
+}
+
+void bad_args() {
+    std::cerr << "Usage: graph-gated-mlp-cpp [cpu|gpu]\n"
+                 "       graph-gated-mlp-cpp [cpu|gpu] <mb> <ic> <oc>\n\n";
+    throw std::invalid_argument("Incorrect input arguments.");
+}
+
+void bench(engine::kind ekind, dnnl_data_type_t dt, const mlp_dims_t &p,
+        double time_limit = 0.) {
+    try {
+        bench_gated_mlp(ekind, static_cast<logical_tensor::data_type>(dt), p,
+                time_limit);
+        get_mem_pool().clear();
+    } catch (dnnl::error &e) {
+        // Catch and report unimplemented cases.
+        if (e.status == dnnl_unimplemented) {
+            std::cout << "unsupported mlp" << std::endl;
+        } else
+            throw;
+    }
+}
+
+void mlp_perf(engine::kind ekind, int argc, char **argv) {
+    // default testing parameters
+    mlp_dims_t params = {1, 4096, 14336};
+
+    if (argc > 2) {
+        if (argc == 5) {
+            params.mb = std::atoi(argv[2]);
+            params.ic = std::atoi(argv[3]);
+            params.oc = std::atoi(argv[4]);
+        } else {
+            bad_args();
+        }
+
+        if (params.mb <= 0 || params.ic <= 0 || params.oc <= 0) { bad_args(); }
+    }
+
+    bench(ekind, dnnl_f32, params, 2000.0 /*ms*/);
+    bench(ekind, dnnl_bf16, params, 2000.0 /*ms*/);
+    bench(ekind, dnnl_f16, params, 2000.0 /*ms*/);
+}
+
+int main(int argc, char **argv) {
+    return handle_example_errors(
+            mlp_perf, parse_engine_kind(argc, argv, 3), argc, argv);
+}
diff --git a/examples/graph/gated_mlp_int4.cpp b/examples/graph/gated_mlp_int4.cpp
new file mode 100644
index 00000000000..2910dba8712
--- /dev/null
+++ b/examples/graph/gated_mlp_int4.cpp
@@ -0,0 +1,356 @@
+/*******************************************************************************
+* Copyright 2025 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#include <algorithm>
+#include <chrono>
+#include <cmath>
+#include <cstring>
+#include <iomanip>
+#include <iostream>
+#include <random>
+#include <vector>
+
+#include "oneapi/dnnl/dnnl.hpp"
+#include "oneapi/dnnl/dnnl_graph.hpp"
+
+#include "graph_example_utils.hpp"
+
+using namespace dnnl;
+
+using namespace dnnl::graph;
+using data_type = logical_tensor::data_type;
+using layout_type = logical_tensor::layout_type;
+using dim = logical_tensor::dim;
+using dims = logical_tensor::dims;
+
+struct mlp_dims_t {
+    dim mb;
+    dim ic;
+    dim oc;
+    dim gr; // group size for int4 group quantization
+};
+
+static const int min_runs = 4;
+
+// This is adapted from the fill_random() function in matmul_perf.cpp.
+void fill_random(std::vector<float> &out) {
+    static std::vector<float> random_data_f;
+    constexpr size_t nrand = 1037;
+
+    if (random_data_f.empty()) {
+        std::mt19937 generator;
+        std::uniform_real_distribution<float> dist_f(-1.0f, 1.0f);
+
+        random_data_f.resize(nrand);
+        for (auto &d : random_data_f)
+            d = dist_f(generator);
+    }
+
+    for (size_t i = 0; i < out.size(); i += nrand) {
+        size_t chunk = std::min(nrand, out.size() - i);
+        std::memcpy(&out[i], random_data_f.data(), chunk * sizeof(float));
+    }
+}
+
+const char *get_type_string(logical_tensor::data_type dt) {
+    const char *type_string = "unknown";
+
+#define TYPE_CASE(T) \
+    if (dt == logical_tensor::data_type::T) type_string = #T;
+    TYPE_CASE(f16);
+    TYPE_CASE(f32);
+    TYPE_CASE(bf16);
+#undef TYPE_CASE
+
+    return type_string;
+}
+
+void print_test_case(logical_tensor::data_type dt, const mlp_dims_t &p) {
+    std::cout << '[' << std::setw(4) << get_type_string(dt);
+    std::cout << " mb = " << p.mb << ", ic = " << p.ic << ", oc = " << p.oc
+              << ", group size = " << p.gr;
+    std::cout << "] " << std::flush;
+}
+
+void bench_gated_mlp(engine::kind ekind, logical_tensor::data_type dt,
+        const mlp_dims_t &p, double time_limit = 0.) {
+    const bool quick_test = (time_limit == 0.);
+    print_test_case(dt, p);
+
+    // input shape
+    const dims src_sz = {p.mb, p.ic};
+    // weight0/weight1 shape: fc_gate and fc_up
+    const dims wei0_sz = {p.ic, p.oc};
+    const dims wei0_scales_sz = {p.ic, p.oc / p.gr};
+    // hidden shape
+    const dims hd_sz = {p.mb, p.oc};
+    // weight2 shape: fc_down
+    const dims wei2_sz = {p.oc, p.ic};
+    const dims wei2_scales_sz = {p.oc, p.ic / p.gr};
+    // output shape
+    const dims out_sz = {p.mb, p.ic};
+
+    allocator alloc = create_allocator(ekind);
+
+    // Create execution dnnl::engine.
+    dnnl::engine eng = make_engine_with_allocator(ekind, 0, alloc);
+    // Create dnnl::stream.
+    dnnl::stream strm(eng);
+
+    // Incremental IDs used to create logical tensors and operations.
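+    // Weights below are stored as u4 with per-group scales and zero-points:
+    // with group_shape {1, gr}, each run of gr consecutive elements along
+    // the last weight axis shares one scale and one zero-point, which is why
+    // the scales/zero-points tensors are gr times smaller on that axis.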
+    size_t id = 0;
+
+    // dequantize for fc_gate weights
+    auto wei0_int4 = logical_tensor(
+            id++, data_type::u4, wei0_sz, layout_type::strided);
+    auto wei0_scales
+            = logical_tensor(id++, dt, wei0_scales_sz, layout_type::strided);
+    auto wei0_zps = logical_tensor(
+            id++, data_type::u8, wei0_scales_sz, layout_type::strided);
+    auto wei0_dt = logical_tensor(id++, dt, wei0_sz, layout_type::strided);
+    auto deq_gate = op(id++, op::kind::DynamicDequantize, "deq_gate");
+    deq_gate.set_attr<std::string>(op::attr::qtype, "per_group");
+    deq_gate.set_attr<std::vector<int64_t>>(op::attr::group_shape, {1, p.gr});
+    deq_gate.set_attr<int64_t>(op::attr::axis, -1);
+    deq_gate.add_inputs({wei0_int4, wei0_scales, wei0_zps});
+    deq_gate.add_outputs({wei0_dt});
+
+    // fc_gate
+    auto src = logical_tensor(id++, dt, src_sz, layout_type::strided);
+    auto out0 = logical_tensor(id++, dt, hd_sz, layout_type::strided);
+    auto fc_gate = op(id++, op::kind::MatMul, "fc_gate");
+    fc_gate.add_inputs({src, wei0_dt});
+    fc_gate.add_outputs({out0});
+
+    // dequantize for fc_up weights
+    auto wei1_int4 = logical_tensor(
+            id++, data_type::u4, wei0_sz, layout_type::strided);
+    auto wei1_scales
+            = logical_tensor(id++, dt, wei0_scales_sz, layout_type::strided);
+    auto wei1_zps = logical_tensor(
+            id++, data_type::u8, wei0_scales_sz, layout_type::strided);
+    auto wei1_dt = logical_tensor(id++, dt, wei0_sz, layout_type::strided);
+    auto deq_up = op(id++, op::kind::DynamicDequantize, "deq_up");
+    deq_up.set_attr<std::string>(op::attr::qtype, "per_group");
+    deq_up.set_attr<std::vector<int64_t>>(op::attr::group_shape, {1, p.gr});
+    deq_up.set_attr<int64_t>(op::attr::axis, -1);
+    deq_up.add_inputs({wei1_int4, wei1_scales, wei1_zps});
+    deq_up.add_outputs({wei1_dt});
+
+    // fc_up
+    auto out1 = logical_tensor(id++, dt, hd_sz, layout_type::strided);
+    auto fc_up = op(id++, op::kind::MatMul, "fc_up");
+    fc_up.add_inputs({src, wei1_dt});
+    fc_up.add_outputs({out1});
+
+    // activation swish: sigmoid
+    auto out2 = logical_tensor(id++, dt, hd_sz, layout_type::strided);
+    auto swi_sig = op(id++, op::kind::Sigmoid, "swish/sigmoid");
+    swi_sig.add_inputs({out0});
+    swi_sig.add_outputs({out2});
+
+    // activation swish: multiply
+    auto out3 = logical_tensor(id++, dt, hd_sz, layout_type::strided);
+    auto swi_mul = op(id++, op::kind::Multiply, "swish/multiply");
+    swi_mul.add_inputs({out0, out2});
+    swi_mul.add_outputs({out3});
+
+    // multiplication
+    auto out4 = logical_tensor(id++, dt, hd_sz, layout_type::strided);
+    auto mul = op(id++, op::kind::Multiply, "mul");
+    mul.add_inputs({out3, out1});
+    mul.add_outputs({out4});
+
+    // dequantize for fc_down weights
+    auto wei2_int4 = logical_tensor(
+            id++, data_type::u4, wei2_sz, layout_type::strided);
+    auto wei2_scales
+            = logical_tensor(id++, dt, wei2_scales_sz, layout_type::strided);
+    auto wei2_zps = logical_tensor(
+            id++, data_type::u8, wei2_scales_sz, layout_type::strided);
+    auto wei2_dt = logical_tensor(id++, dt, wei2_sz, layout_type::strided);
+    auto deq_down = op(id++, op::kind::DynamicDequantize, "deq_down");
+    deq_down.set_attr<std::string>(op::attr::qtype, "per_group");
+    deq_down.set_attr<std::vector<int64_t>>(op::attr::group_shape, {1, p.gr});
+    deq_down.set_attr<int64_t>(op::attr::axis, -1);
+    deq_down.add_inputs({wei2_int4, wei2_scales, wei2_zps});
+    deq_down.add_outputs({wei2_dt});
+
+    // fc_down
+    auto dst = logical_tensor(id++, dt, out_sz, layout_type::strided);
+    auto fc_down = op(id++, op::kind::MatMul, "fc_down");
+    fc_down.add_inputs({out4, wei2_dt});
+    fc_down.add_outputs({dst});
+
+    // Construct a gated mlp graph with engine kind and operations.
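+    // Note: set_fpmath_mode below is called with apply_to_int = true, which
+    // requests that the chosen floating-point math mode also apply to the
+    // integer (int4) weight computation inside the fused partition.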
+    dnnl::graph::graph mlp(ekind);
+    mlp.set_fpmath_mode(fpmath_mode::strict, true);
+    mlp.add_op(deq_gate);
+    mlp.add_op(deq_up);
+    mlp.add_op(fc_gate);
+    mlp.add_op(fc_up);
+    mlp.add_op(swi_sig);
+    mlp.add_op(swi_mul);
+    mlp.add_op(mul);
+    mlp.add_op(deq_down);
+    mlp.add_op(fc_down);
+    mlp.finalize();
+
+    // Get partitions from the mlp graph.
+    std::vector<partition> partitions = mlp.get_partitions();
+    // This is just for oneDNN testing purposes.
+    if (partitions.size() != 1) {
+        std::cout << "unsupported mlp" << std::endl;
+        return;
+    }
+
+    // Compile the partition with inputs, outputs, and an engine.
+    compiled_partition cp = partitions[0].compile(
+            {src, wei0_int4, wei0_scales, wei0_zps, wei1_int4, wei1_scales,
+                    wei1_zps, wei2_int4, wei2_scales, wei2_zps},
+            {dst}, eng);
+
+    // Create tensor objects
+    auto ts_src = tensor(src, eng);
+    auto ts_wei0 = tensor(wei0_int4, eng);
+    auto ts_wei0_scales = tensor(wei0_scales, eng);
+    auto ts_wei0_zps = tensor(wei0_zps, eng);
+    auto ts_wei1 = tensor(wei1_int4, eng);
+    auto ts_wei1_scales = tensor(wei1_scales, eng);
+    auto ts_wei1_zps = tensor(wei1_zps, eng);
+    auto ts_wei2 = tensor(wei2_int4, eng);
+    auto ts_wei2_scales = tensor(wei2_scales, eng);
+    auto ts_wei2_zps = tensor(wei2_zps, eng);
+    auto ts_dst = tensor(dst, eng);
+
+    // Allocate user data.
+    std::vector<float> src_data(product(src_sz));
+    std::vector<float> wei0_data(product(wei0_sz));
+    std::vector<float> wei1_data(product(wei0_sz));
+    std::vector<float> wei2_data(product(wei2_sz));
+
+    fill_random(src_data);
+    fill_random(wei0_data);
+    fill_random(wei1_data);
+    fill_random(wei2_data);
+
+    // Write data to tensor object's handle.
+    write_to_dnnl_tensor(src_data.data(), ts_src);
+    write_to_dnnl_tensor(wei0_data.data(), ts_wei0);
+    write_to_dnnl_tensor(wei1_data.data(), ts_wei1);
+    write_to_dnnl_tensor(wei2_data.data(), ts_wei2);
+
+    // Warmup run.
+    // Execute the compiled partition of mlp.
+    // TODO: initialize the scales and zps.
+    cp.execute(strm,
+            {ts_src, ts_wei0, ts_wei0_scales, ts_wei0_zps, ts_wei1,
+                    ts_wei1_scales, ts_wei1_zps, ts_wei2, ts_wei2_scales,
+                    ts_wei2_zps},
+            {ts_dst});
+
+    // Wait for the computation to finish.
+    strm.wait();
+
+    // First run.
+    auto start_first = std::chrono::steady_clock::now();
+    cp.execute(strm,
+            {ts_src, ts_wei0, ts_wei0_scales, ts_wei0_zps, ts_wei1,
+                    ts_wei1_scales, ts_wei1_zps, ts_wei2, ts_wei2_scales,
+                    ts_wei2_zps},
+            {ts_dst});
+    strm.wait();
+    auto end_first = std::chrono::steady_clock::now();
+    std::chrono::duration<double, std::milli> dur_first
+            = end_first - start_first;
+
+    if (quick_test) return;
+
+    // Timing runs.
+    const int runs = std::max(min_runs, int(time_limit / dur_first.count()));
+    auto start = std::chrono::steady_clock::now();
+    for (int i = 0; i <= runs; i++) {
+        cp.execute(strm,
+                {ts_src, ts_wei0, ts_wei0_scales, ts_wei0_zps, ts_wei1,
+                        ts_wei1_scales, ts_wei1_zps, ts_wei2, ts_wei2_scales,
+                        ts_wei2_zps},
+                {ts_dst});
+    }
+    strm.wait();
+    auto end = std::chrono::steady_clock::now();
+    std::chrono::duration<double, std::milli> duration = end - start;
+
+    // Display the results.
+    double avg_time = (duration.count() - dur_first.count()) / runs;
+    std::cout << "graph runs: " << runs + 1 << "; ";
+    std::cout << "avg_time: " << avg_time << " ms" << std::endl;
+}
+
+void bad_args() {
+    std::cerr << "Usage: graph-gated-mlp-int4-cpp [cpu|gpu]\n"
+                 "       graph-gated-mlp-int4-cpp [cpu|gpu] "
+                 "<mb> <ic> <oc> <gr>\n\n";
+    throw std::invalid_argument("Incorrect input arguments.");
+}
+
+void bench(engine::kind ekind, dnnl_data_type_t dt, const mlp_dims_t &p,
+        double time_limit = 0.) {
+    try {
+        bench_gated_mlp(ekind, static_cast<logical_tensor::data_type>(dt), p,
+                time_limit);
+        get_mem_pool().clear();
+    } catch (dnnl::error &e) {
+        // Catch and report unimplemented cases.
+        if (e.status == dnnl_unimplemented) {
+            std::cout << "unsupported mlp" << std::endl;
+        } else
+            throw;
+    }
+}
+
+void mlp_perf(engine::kind ekind, int argc, char **argv) {
+    // default testing parameters
+    mlp_dims_t params = {1, 4096, 14336, 128};
+
+    if (argc > 2) {
+        if (argc == 6) {
+            params.mb = std::atoi(argv[2]);
+            params.ic = std::atoi(argv[3]);
+            params.oc = std::atoi(argv[4]);
+            params.gr = std::atoi(argv[5]);
+        } else {
+            bad_args();
+        }
+
+        if (params.mb <= 0 || params.ic <= 0 || params.oc <= 0
+                || params.gr <= 0) {
+            bad_args();
+        }
+
+        if (params.ic < params.gr || params.oc < params.gr
+                || params.ic % params.gr != 0 || params.oc % params.gr != 0) {
+            bad_args();
+        }
+    }
+
+    bench(ekind, dnnl_f32, params, 2000.0 /*ms*/);
+    bench(ekind, dnnl_bf16, params, 2000.0 /*ms*/);
+    bench(ekind, dnnl_f16, params, 2000.0 /*ms*/);
+}
+
+int main(int argc, char **argv) {
+    return handle_example_errors(
+            mlp_perf, parse_engine_kind(argc, argv, 4), argc, argv);
+}
diff --git a/examples/graph/gated_mlp_wei_combined.cpp b/examples/graph/gated_mlp_wei_combined.cpp
new file mode 100644
index 00000000000..2d3a6c88071
--- /dev/null
+++ b/examples/graph/gated_mlp_wei_combined.cpp
@@ -0,0 +1,300 @@
+/*******************************************************************************
+* Copyright 2024-2025 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#include <algorithm>
+#include <cassert>
+#include <chrono>
+#include <cstring>
+#include <iomanip>
+#include <iostream>
+#include <random>
+#include <vector>
+
+#include "oneapi/dnnl/dnnl.hpp"
+#include "oneapi/dnnl/dnnl_graph.hpp"
+
+#include "graph_example_utils.hpp"
+
+using namespace dnnl;
+
+using namespace dnnl::graph;
+using layout_type = logical_tensor::layout_type;
+using dim = logical_tensor::dim;
+using dims = logical_tensor::dims;
+
+struct mlp_dims_t {
+    dim mb;
+    dim ic;
+    dim oc;
+};
+
+static const int min_runs = 4;
+
+// this is changed from the fill_random() function in matmul_perf.cpp.
+void fill_random(std::vector<float> &out) {
+    static std::vector<float> random_data_f;
+    constexpr size_t nrand = 1037;
+
+    if (random_data_f.empty()) {
+        std::mt19937 generator;
+        std::uniform_real_distribution<float> dist_f(-1.0f, 1.0f);
+
+        random_data_f.resize(nrand);
+        for (auto &d : random_data_f)
+            d = dist_f(generator);
+    }
+
+    for (size_t i = 0; i < out.size(); i += nrand) {
+        size_t chunk = std::min(nrand, out.size() - i);
+        std::memcpy(&out[i], random_data_f.data(), chunk * sizeof(float));
+    }
+}
+
+const char *get_type_string(logical_tensor::data_type dt) {
+    const char *type_string = "unknown";
+
+#define TYPE_CASE(T) \
+    if (dt == logical_tensor::data_type::T) type_string = #T;
+    TYPE_CASE(f16);
+    TYPE_CASE(f32);
+    TYPE_CASE(bf16);
+#undef TYPE_CASE
+
+    return type_string;
+}
+
+size_t size_of(logical_tensor::data_type dt) {
+    // This example only supports f32, bf16, and f16.
+    switch (dt) {
+        case logical_tensor::data_type::f32: return 4;
+        case logical_tensor::data_type::bf16:
+        case logical_tensor::data_type::f16: return 2;
+        default: assert(!"unknown data_type");
+    }
+
+    return (size_t)-1; /* not supposed to be reachable */
+}
+
+void print_test_case(logical_tensor::data_type dt, const mlp_dims_t &p) {
+    std::cout << '[' << std::setw(4) << get_type_string(dt);
+    std::cout << " mb = " << p.mb << ", ic = " << p.ic << ", oc = " << p.oc;
+    std::cout << "] " << std::flush;
+}
+
+void bench_gated_mlp(engine::kind ekind, logical_tensor::data_type dt,
+        const mlp_dims_t &p, double time_limit = 0.) {
+    const bool quick_test = (time_limit == 0.);
+    print_test_case(dt, p);
+
+    allocator alloc = create_allocator(ekind);
+
+    // Create execution dnnl::engine.
+    dnnl::engine eng = make_engine_with_allocator(ekind, 0, alloc);
+    // Create dnnl::stream.
+    dnnl::stream strm(eng);
+
+    // input shape
+    const dims src_sz = {p.mb, p.ic};
+    // weight0/weight1 shape: fc_gate and fc_up
+    const dims wei0_sz = {p.ic, p.oc};
+    // hidden shape
+    const dims hd_sz = {p.mb, p.oc};
+    // weight2 shape: fc_down
+    const dims wei2_sz = {p.oc, p.ic};
+    // output shape
+    const dims out_sz = {p.mb, p.ic};
+
+    // wei0 and wei1 are combined into shape (ic, 2 * oc), assuming the
+    // first part is wei0 for fc_gate and the second part is wei1 for fc_up.
+    const dims combined_wei0_sz = {p.ic, 2 * p.oc};
+    const dims combined_wei0_st = {2 * p.oc, 1};
+
+    // Incremental IDs used to create logical tensors and operations.
+    size_t id = 0;
+
+    // This logical tensor is not part of the graph but is used to generate
+    // the big chunk of device memory, which would already exist in a real
+    // user application or framework.
+    auto combined_wei0
+            = logical_tensor(id++, dt, combined_wei0_sz, layout_type::strided);
+
+    // fc_gate: wei0 is non-contiguous now.
+    auto src = logical_tensor(id++, dt, src_sz, layout_type::strided);
+    auto wei0 = logical_tensor(id++, dt, wei0_sz, combined_wei0_st);
+    auto out0 = logical_tensor(id++, dt, hd_sz, layout_type::strided);
+    auto fc_gate = op(id++, op::kind::MatMul, "fc_gate");
+    fc_gate.add_inputs({src, wei0});
+    fc_gate.add_outputs({out0});
+
+    // fc_up: wei1 is non-contiguous now.
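+    // wei1 aliases the second (ic, oc) half of the combined (ic, 2 * oc)
+    // buffer: it keeps the same row stride of 2 * oc elements, and its
+    // starting offset of oc elements is applied later via the data handle.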
+    auto wei1 = logical_tensor(id++, dt, wei0_sz, combined_wei0_st);
+    auto out1 = logical_tensor(id++, dt, hd_sz, layout_type::strided);
+    auto fc_up = op(id++, op::kind::MatMul, "fc_up");
+    fc_up.add_inputs({src, wei1});
+    fc_up.add_outputs({out1});
+
+    // activation swish: sigmoid
+    auto out2 = logical_tensor(id++, dt, hd_sz, layout_type::strided);
+    auto swi_sig = op(id++, op::kind::Sigmoid, "swish/sigmoid");
+    swi_sig.add_inputs({out0});
+    swi_sig.add_outputs({out2});
+
+    // activation swish: multiply
+    auto out3 = logical_tensor(id++, dt, hd_sz, layout_type::strided);
+    auto swi_mul = op(id++, op::kind::Multiply, "swish/multiply");
+    swi_mul.add_inputs({out0, out2});
+    swi_mul.add_outputs({out3});
+
+    // multiplication
+    auto out4 = logical_tensor(id++, dt, hd_sz, layout_type::strided);
+    auto mul = op(id++, op::kind::Multiply, "mul");
+    mul.add_inputs({out3, out1});
+    mul.add_outputs({out4});
+
+    // fc_down
+    auto wei2 = logical_tensor(id++, dt, wei2_sz, layout_type::strided);
+    auto dst = logical_tensor(id++, dt, out_sz, layout_type::strided);
+    auto fc_down = op(id++, op::kind::MatMul, "fc_down");
+    fc_down.add_inputs({out4, wei2});
+    fc_down.add_outputs({dst});
+
+    // Construct a gated mlp graph with engine kind and operations.
+    dnnl::graph::graph mlp(ekind);
+    mlp.add_op(fc_gate);
+    mlp.add_op(fc_up);
+    mlp.add_op(swi_sig);
+    mlp.add_op(swi_mul);
+    mlp.add_op(mul);
+    mlp.add_op(fc_down);
+    mlp.finalize();
+
+    // Get partitions from the mlp graph.
+    std::vector<partition> partitions = mlp.get_partitions();
+    // This is just for oneDNN testing purposes.
+    if (partitions.size() != 1) {
+        std::cout << "unsupported mlp" << std::endl;
+        return;
+    }
+
+    // Compile the partition with inputs, outputs, and an engine.
+    compiled_partition cp
+            = partitions[0].compile({src, wei0, wei1, wei2}, {dst}, eng);
+
+    // Create tensor objects
+    auto ts_src = tensor(src, eng);
+    auto ts_combined_wei0 = tensor(combined_wei0, eng);
+    auto ts_wei2 = tensor(wei2, eng);
+    auto ts_dst = tensor(dst, eng);
+
+    // Allocate user data.
+    std::vector<float> src_data(product(src_sz));
+    std::vector<float> combined_wei0_data(product(combined_wei0_sz));
+    std::vector<float> wei2_data(product(wei2_sz));
+
+    fill_random(src_data);
+    fill_random(combined_wei0_data);
+    fill_random(wei2_data);
+
+    // Write data to tensor object's handle.
+    write_to_dnnl_tensor(src_data.data(), ts_src);
+    write_to_dnnl_tensor(combined_wei0_data.data(), ts_combined_wei0);
+    write_to_dnnl_tensor(wei2_data.data(), ts_wei2);
+
+    // create ts_wei0, ts_wei1 from the data handle of combined_wei0 and
+    // offsets.
+    char *handle
+            = reinterpret_cast<char *>(ts_combined_wei0.get_data_handle());
+    auto ts_wei0 = tensor(wei0, eng, handle);
+    auto ts_wei1 = tensor(wei1, eng, handle + p.oc * size_of(dt));
+
+    // Warmup run.
+    // Execute the compiled partition of mlp.
+    cp.execute(strm, {ts_src, ts_wei0, ts_wei1, ts_wei2}, {ts_dst});
+
+    // Wait for the computation to finish.
+    strm.wait();
+
+    // First run.
+    auto start_first = std::chrono::steady_clock::now();
+    cp.execute(strm, {ts_src, ts_wei0, ts_wei1, ts_wei2}, {ts_dst});
+    strm.wait();
+    auto end_first = std::chrono::steady_clock::now();
+    std::chrono::duration<double, std::milli> dur_first
+            = end_first - start_first;
+
+    if (quick_test) return;
+
+    // Timing runs.
+    const int runs = std::max(min_runs, int(time_limit / dur_first.count()));
+    auto start = std::chrono::steady_clock::now();
+    for (int i = 0; i <= runs; i++) {
+        cp.execute(strm, {ts_src, ts_wei0, ts_wei1, ts_wei2}, {ts_dst});
+    }
+    strm.wait();
+    auto end = std::chrono::steady_clock::now();
+    std::chrono::duration<double, std::milli> duration = end - start;
+
+    // Display the results.
+    double avg_time = (duration.count() - dur_first.count()) / runs;
+    std::cout << "graph runs: " << runs + 1 << "; ";
+    std::cout << "avg_time: " << avg_time << " ms" << std::endl;
+}
+
+void bad_args() {
+    std::cerr << "Usage: graph-gated-mlp-wei-combined-cpp [cpu|gpu]\n"
+                 "       graph-gated-mlp-wei-combined-cpp [cpu|gpu] "
+                 "<mb> <ic> <oc>\n\n";
+    throw std::invalid_argument("Incorrect input arguments.");
+}
+
+void bench(engine::kind ekind, dnnl_data_type_t dt, const mlp_dims_t &p,
+        double time_limit = 0.) {
+    try {
+        bench_gated_mlp(ekind, static_cast<logical_tensor::data_type>(dt), p,
+                time_limit);
+        get_mem_pool().clear();
+    } catch (dnnl::error &e) {
+        // Catch and report unimplemented cases.
+        if (e.status == dnnl_unimplemented) {
+            std::cout << "unsupported mlp" << std::endl;
+        } else
+            throw;
+    }
+}
+
+void mlp_perf(engine::kind ekind, int argc, char **argv) {
+    // default testing parameters
+    mlp_dims_t params = {1, 4096, 14336};
+
+    if (argc > 2) {
+        if (argc == 5) {
+            params.mb = std::atoi(argv[2]);
+            params.ic = std::atoi(argv[3]);
+            params.oc = std::atoi(argv[4]);
+        } else {
+            bad_args();
+        }
+
+        if (params.mb <= 0 || params.ic <= 0 || params.oc <= 0) { bad_args(); }
+    }
+
+    bench(ekind, dnnl_f32, params, 2000.0 /*ms*/);
+    bench(ekind, dnnl_bf16, params, 2000.0 /*ms*/);
+    bench(ekind, dnnl_f16, params, 2000.0 /*ms*/);
+}
+
+int main(int argc, char **argv) {
+    return handle_example_errors(
+            mlp_perf, parse_engine_kind(argc, argv, 3), argc, argv);
+}
diff --git a/examples/graph/gqa.cpp b/examples/graph/gqa.cpp
index 3c60bfd1d9d..0f0e244116d 100644
--- a/examples/graph/gqa.cpp
+++ b/examples/graph/gqa.cpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2024 Intel Corporation
+* Copyright 2024-2025 Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -29,7 +29,6 @@
 #include "graph_example_utils.hpp"
 
 using namespace dnnl;
-using tag = memory::format_tag;
 
 using namespace dnnl::graph;
 using layout_type = logical_tensor::layout_type;
diff --git a/examples/graph/graph_example_utils.hpp b/examples/graph/graph_example_utils.hpp
index 02a9a844812..4671aa36d9d 100644
--- a/examples/graph/graph_example_utils.hpp
+++ b/examples/graph/graph_example_utils.hpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2023-2024 Intel Corporation
+* Copyright 2023-2025 Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -40,7 +40,8 @@
 /// @param partitions a list of partitions
 /// @param id_to_set_any_layout a set of ids of logical tensors with any layout
 /// type
-void set_any_layout(const std::vector<dnnl::graph::partition> &partitions,
+inline void set_any_layout(
+        const std::vector<dnnl::graph::partition> &partitions,
         std::unordered_set<size_t> &id_to_set_any_layout) {
     // mapping from output tensor id to the all supported flags of
     // supported partitions, we may only need outputs' supported flags
@@ -104,30 +105,30 @@ void set_any_layout(const std::vector<dnnl::graph::partition> &partitions,
     }
 }
 
-struct cpu_deletor {
-    cpu_deletor() = default;
+struct cpu_deletor_t {
+    cpu_deletor_t() = default;
     void operator()(void *ptr) {
         if (ptr) free(ptr);
     }
 };
 
 #ifdef DNNL_WITH_SYCL
-struct sycl_deletor {
-    sycl_deletor() = delete;
+struct sycl_deletor_t {
+    sycl_deletor_t() = delete;
     ::sycl::context ctx_;
-    sycl_deletor(const ::sycl::context &ctx) : ctx_(ctx) {}
+    sycl_deletor_t(const ::sycl::context &ctx) : ctx_(ctx) {}
     void operator()(void *ptr) {
         if (ptr) ::sycl::free(ptr, ctx_);
     }
 };
 
-void *sycl_malloc_wrapper(
+inline void *sycl_malloc_wrapper(
         size_t size, size_t alignment, const void *dev, const void *ctx) {
     return malloc_shared(size, *static_cast<const sycl::device *>(dev),
             *static_cast<const sycl::context *>(ctx));
 }
 
-void sycl_free_wrapper(
+inline void sycl_free_wrapper(
         void *ptr, const void *device, const void *context, void *event) {
     // Device is not used in this example, but it may be useful for some users
     // application.
@@ -142,7 +143,7 @@ void sycl_free_wrapper(
 }
 #endif
 
-void allocate_graph_mem(std::vector<dnnl::graph::tensor> &tensors,
+inline void allocate_graph_mem(std::vector<dnnl::graph::tensor> &tensors,
         const std::vector<dnnl::graph::logical_tensor> &lts,
         std::vector<std::shared_ptr<void>> &data_buffer,
         const dnnl::engine &eng) {
@@ -152,14 +153,14 @@ void allocate_graph_mem(std::vector<dnnl::graph::tensor> &tensors,
 
         // memory allocation
         data_buffer.push_back({});
-        data_buffer.back().reset(malloc(mem_size), cpu_deletor {});
+        data_buffer.back().reset(malloc(mem_size), cpu_deletor_t {});
 
         dnnl::graph::tensor new_ts {lt, eng, data_buffer.back().get()};
         tensors.push_back(new_ts);
     }
 }
 
-void allocate_graph_mem(std::vector<dnnl::graph::tensor> &tensors,
+inline void allocate_graph_mem(std::vector<dnnl::graph::tensor> &tensors,
        const std::vector<dnnl::graph::logical_tensor> &lts,
        std::vector<std::shared_ptr<void>> &data_buffer,
        std::unordered_map<size_t, dnnl::graph::tensor> &global_outputs_ts_map,
@@ -180,7 +181,7 @@ void allocate_graph_mem(std::vector<dnnl::graph::tensor> &tensors,
 
         // memory allocation
         data_buffer.push_back({});
-        data_buffer.back().reset(malloc(mem_size), cpu_deletor {});
+        data_buffer.back().reset(malloc(mem_size), cpu_deletor_t {});
 
         dnnl::graph::tensor new_ts {lt, eng, data_buffer.back().get()};
         tensors.push_back(new_ts);
@@ -191,7 +192,7 @@ void allocate_graph_mem(std::vector<dnnl::graph::tensor> &tensors,
 }
 
 #ifdef DNNL_WITH_SYCL
-void allocate_sycl_graph_mem(std::vector<dnnl::graph::tensor> &tensors,
+inline void allocate_sycl_graph_mem(std::vector<dnnl::graph::tensor> &tensors,
         const std::vector<dnnl::graph::logical_tensor> &lts,
         std::vector<std::shared_ptr<void>> &data_buffer,
         sycl::queue &q, const dnnl::engine &eng) {
@@ -203,14 +204,14 @@ void allocate_sycl_graph_mem(std::vector<dnnl::graph::tensor> &tensors,
         data_buffer.push_back({});
         data_buffer.back().reset(::sycl::malloc_shared(mem_size,
                                          q.get_device(), q.get_context()),
-                sycl_deletor {q.get_context()});
+                sycl_deletor_t {q.get_context()});
 
         dnnl::graph::tensor new_ts {lt, eng, data_buffer.back().get()};
         tensors.push_back(new_ts);
     }
 }
 
-void allocate_sycl_graph_mem(std::vector<dnnl::graph::tensor> &tensors,
+inline void allocate_sycl_graph_mem(std::vector<dnnl::graph::tensor> &tensors,
        const std::vector<dnnl::graph::logical_tensor> &lts,
        std::vector<std::shared_ptr<void>> &data_buffer,
        std::unordered_map<size_t, dnnl::graph::tensor> &global_outputs_ts_map,
@@ -233,7 +234,7 @@ void allocate_sycl_graph_mem(std::vector<dnnl::graph::tensor> &tensors,
         data_buffer.push_back({});
         data_buffer.back().reset(::sycl::malloc_shared(mem_size,
                                          q.get_device(), q.get_context()),
-                sycl_deletor {q.get_context()});
+                sycl_deletor_t {q.get_context()});
 
         dnnl::graph::tensor new_ts {lt, eng, data_buffer.back().get()};
         tensors.push_back(new_ts);
@@ -292,7 +293,7 @@ static void *ocl_malloc_device(
 }
 
 static void ocl_free(
-        void *ptr, cl_device_id dev, const cl_context ctx, cl_event event) {
+        void *ptr, cl_device_id dev, cl_context ctx, cl_event event) {
     if (nullptr == ptr) return;
     using F = cl_int (*)(cl_context, void *);
     if (event) { OCL_CHECK(clWaitForEvents(1, &event)); }
@@ -305,7 +306,7 @@ static void ocl_free(
     OCL_CHECK(f(ctx, ptr));
 }
 
-void allocate_ocl_graph_mem(std::vector<dnnl::graph::tensor> &tensors,
+inline void allocate_ocl_graph_mem(std::vector<dnnl::graph::tensor> &tensors,
        const std::vector<dnnl::graph::logical_tensor> &lts,
        std::vector<std::shared_ptr<void>> &data_buffer,
        std::unordered_map<size_t, dnnl::graph::tensor> &global_outputs_ts_map,
@@ -341,7 +342,8 @@ void allocate_ocl_graph_mem(std::vector<dnnl::graph::tensor> &tensors,
     }
 }
 
-void ocl_memcpy(dnnl::engine &eng, void *dst, const void *src, size_t size) {
+inline void ocl_memcpy(
+        dnnl::engine &eng, void *dst, const void *src, size_t size) {
     using F = cl_int (*)(cl_command_queue, cl_bool, void *, const void *,
             size_t, cl_uint, const cl_event *, cl_event *);
     if (!src || !dst) return;
@@ -370,8 +372,6 @@ void ocl_memcpy(dnnl::engine &eng, void *dst, const void *src, size_t size) {
     err = f(queue, CL_FALSE, dst, src, size, 0, nullptr, nullptr);
     if (err != CL_SUCCESS)
         throw std::runtime_error("clEnqueueMemcpyINTEL failed");
-
-    return;
 }
 #endif
 
@@ -525,7 +525,7 @@ class simple_memory_pool_t {
 #ifdef DNNL_WITH_SYCL
             auto sh_ptr = std::shared_ptr<void> {
                     sycl_malloc_wrapper(size, alignment, dev, ctx),
-                    sycl_deletor {*static_cast<const ::sycl::context *>(ctx)}};
+                    sycl_deletor_t {*static_cast<const ::sycl::context *>(ctx)}};
 #endif
 
 #if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL
@@ -535,7 +535,7 @@ class simple_memory_pool_t {
 #endif
             ptr = sh_ptr.get();
             // record the map of mm size and its ptr for reuse
-            map_size_ptr_.emplace(std::make_pair(size, sh_ptr));
+            map_size_ptr_.emplace(size, sh_ptr);
             is_free_ptr_[ptr] = false;
         }
         return ptr;
     }
@@ -562,10 +562,11 @@ class simple_memory_pool_t {
             }
         }
         if (need_alloc_new_mm) {
-            auto sh_ptr = std::shared_ptr<void> {malloc(size), cpu_deletor {}};
+            auto sh_ptr
+                    = std::shared_ptr<void> {malloc(size), cpu_deletor_t {}};
             ptr = sh_ptr.get();
             // record the map of mm size and its ptr for reuse
-            map_size_ptr_.emplace(std::make_pair(size, sh_ptr));
+            map_size_ptr_.emplace(size, sh_ptr);
             is_free_ptr_[ptr] = false;
         }
         return ptr;
     }
@@ -575,27 +576,26 @@ class simple_memory_pool_t {
     void deallocate(
             void *ptr, const void *device, const void *context, void *event) {
         std::lock_guard<std::mutex> pool_guard(pool_lock);
-        if (event) {
-            auto sycl_deps_ptr = static_cast<::sycl::event *>(event);
-            sycl_deps_ptr->wait();
-        }
+        // This example currently uses in-order queues, so kernels are
+        // executed in the order in which they are submitted; there is no
+        // need to wait on the event.
         is_free_ptr_[ptr] = true;
         return;
     }
 #endif
 #if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL
     void deallocate(
-            void *ptr, cl_device_id dev, const cl_context ctx, cl_event event) {
+            void *ptr, cl_device_id dev, cl_context ctx, cl_event event) {
         std::lock_guard<std::mutex> pool_guard(pool_lock);
-        if (event) { OCL_CHECK(clWaitForEvents(1, &event)); }
+        // This example currently uses in-order queues, so kernels are
+        // executed in the order in which they are submitted; there is no
+        // need to wait on the event.
         is_free_ptr_[ptr] = true;
-        return;
     }
 #endif
     void deallocate_host(void *ptr) {
         std::lock_guard<std::mutex> pool_guard(pool_lock);
         is_free_ptr_[ptr] = true;
-        return;
     }
     void clear() {
         dnnl::graph::set_compiled_partition_cache_capacity(0);
@@ -609,12 +609,12 @@ class simple_memory_pool_t {
     std::unordered_map<void *, bool> is_free_ptr_;
 };
 
-simple_memory_pool_t &get_mem_pool() {
+inline simple_memory_pool_t &get_mem_pool() {
     static simple_memory_pool_t mem_pool;
     return mem_pool;
 }
 
-dnnl::graph::allocator create_allocator(dnnl::engine::kind ekind) {
+inline dnnl::graph::allocator create_allocator(dnnl::engine::kind ekind) {
     if (ekind == dnnl::engine::kind::cpu) {
 #if DNNL_CPU_RUNTIME == DNNL_RUNTIME_SYCL
         auto alloc_func = [](size_t size, size_t alignment, const void *dev,
@@ -653,8 +653,8 @@ dnnl::graph::allocator create_allocator(dnnl::engine::kind ekind) {
                 cl_context ctx) -> void * {
             return get_mem_pool().allocate(size, alignment, dev, ctx);
         };
-        auto dealloc_func = [](void *ptr, cl_device_id dev,
-                                    const cl_context ctx, cl_event event) {
+        auto dealloc_func = [](void *ptr, cl_device_id dev, cl_context ctx,
+                                    cl_event event) {
             return get_mem_pool().deallocate(ptr, dev, ctx, event);
         };
         return dnnl::graph::ocl_interop::make_allocator(
diff --git a/examples/graph/mqa.cpp b/examples/graph/mqa.cpp
index 3f35b4684de..4097d356a00 100644
--- a/examples/graph/mqa.cpp
+++ b/examples/graph/mqa.cpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2024 Intel Corporation
+* Copyright 2024-2025 Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -29,7 +29,6 @@
 #include "graph_example_utils.hpp"
 
 using namespace dnnl;
-using tag = memory::format_tag;
 
 using namespace dnnl::graph;
 using layout_type = logical_tensor::layout_type;
diff --git a/examples/graph/sdpa.cpp b/examples/graph/sdpa.cpp
index 3b86c51808c..99f3dc2c64b 100644
--- a/examples/graph/sdpa.cpp
+++ b/examples/graph/sdpa.cpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2024 Intel Corporation
+* Copyright 2024-2025 Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -29,7 +29,6 @@
 #include "graph_example_utils.hpp"
 
 using namespace dnnl;
-using tag = memory::format_tag;
 
 using namespace dnnl::graph;
 using layout_type = logical_tensor::layout_type;
@@ -97,6 +96,9 @@ void bench_sdpa_primitives(engine::kind ekind, memory::data_type dt,
     // Create dnnl::stream.
     dnnl::stream strm(eng);
 
+    // Intermediate data type
+    const memory::data_type dt_inter = memory::data_type::f32;
+
     // Prepare input and output shapes to construct the sdpa graph.
     const memory::dims q_sz = {p.mb, p.head_num, p.query_num, p.head_size};
     const memory::dims k_sz = {p.mb, p.head_num, p.head_size, p.seq_len};
@@ -109,11 +111,12 @@ void bench_sdpa_primitives(engine::kind ekind, memory::data_type dt,
     //   scaled_score = score / scale
     //   masked_score = scaled_score + mask
     // All combined in a single matmul primitive.
- auto query_md = memory::desc(q_sz, dt, tag::abcd); - auto key_md = memory::desc(k_sz, dt, tag::abdc); - auto score_md = memory::desc(score_sz, dt, tag::abcd); - auto scale_md = memory::desc(scale_sz, dt, tag::abcd); - auto mask_md = memory::desc(mask_sz, dt, tag::abcd); + auto query_md = memory::desc(q_sz, dt, memory::format_tag::abcd); + auto key_md = memory::desc(k_sz, dt, memory::format_tag::abdc); + auto score_md = memory::desc(score_sz, dt_inter, memory::format_tag::abcd); + auto scale_md = memory::desc(scale_sz, dt, memory::format_tag::abcd); + auto mask_md = memory::desc(mask_sz, dt, memory::format_tag::abcd); + auto probs_md = memory::desc(score_sz, dt, memory::format_tag::abcd); primitive_attr bmm1_attr; bmm1_attr.set_scratchpad_mode(scratchpad_mode::user); @@ -131,16 +134,16 @@ void bench_sdpa_primitives(engine::kind ekind, memory::data_type dt, softmax_attr.set_scratchpad_mode(scratchpad_mode::user); auto softmax_pd = softmax_forward::primitive_desc(eng, prop_kind::forward_inference, algorithm::softmax_accurate, score_md, - score_md, /* axis = */ score_md.get_ndims() - 1, softmax_attr); + probs_md, /* axis = */ score_md.get_ndims() - 1, softmax_attr); auto softmax_prim = softmax_forward(softmax_pd); // attention_output = attention_probs x value - auto value_md = memory::desc(v_sz, dt, tag::abcd); - auto output_md = memory::desc(q_sz, dt, tag::abcd); + auto value_md = memory::desc(v_sz, dt, memory::format_tag::abcd); + auto output_md = memory::desc(q_sz, dt, memory::format_tag::abcd); primitive_attr bmm2_attr; bmm2_attr.set_scratchpad_mode(scratchpad_mode::user); auto bmm2_pd = matmul::primitive_desc( - eng, score_md, value_md, output_md, bmm2_attr); + eng, probs_md, value_md, output_md, bmm2_attr); auto bmm2_prim = matmul(bmm2_pd); // Create memory objects @@ -180,10 +183,11 @@ void bench_sdpa_primitives(engine::kind ekind, memory::data_type dt, } auto scratchpad_md = memory::desc({static_cast(max_scratchpad_size)}, - memory::data_type::u8, tag::a); + memory::data_type::u8, memory::format_tag::a); // allocate intermediate memory auto m_score = memory(score_md, eng); + auto m_probs = memory(probs_md, eng); auto m_scratchpad = memory(scratchpad_md, eng); const auto loop = [&]() { @@ -198,11 +202,11 @@ void bench_sdpa_primitives(engine::kind ekind, memory::data_type dt, {DNNL_ARG_SCRATCHPAD, m_scratchpad}}); softmax_prim.execute(strm, - {{DNNL_ARG_SRC, m_score}, {DNNL_ARG_DST, m_score}, + {{DNNL_ARG_SRC, m_score}, {DNNL_ARG_DST, m_probs}, {DNNL_ARG_SCRATCHPAD, m_scratchpad}}); bmm2_prim.execute(strm, - {{DNNL_ARG_SRC, m_score}, {DNNL_ARG_WEIGHTS, m_value}, + {{DNNL_ARG_SRC, m_probs}, {DNNL_ARG_WEIGHTS, m_value}, {DNNL_ARG_DST, m_output}, {DNNL_ARG_SCRATCHPAD, m_scratchpad}}); }; @@ -283,10 +287,13 @@ void bench_sdpa(engine::kind ekind, logical_tensor::data_type dt, // Incremental IDs used to create logical tensors and operations. 
size_t id = 0; + // Intermediate data type + const logical_tensor::data_type dt_inter = logical_tensor::data_type::f32; + // score = query x key.T auto query = logical_tensor(id++, dt, qv_sz, layout_type::strided); auto key = logical_tensor(id++, dt, k_sz, layout_type::strided); - auto score = logical_tensor(id++, dt, score_sz, layout_type::strided); + auto score = logical_tensor(id++, dt_inter, score_sz, layout_type::strided); auto bmm1 = op(id++, op::kind::MatMul, "bmm1"); bmm1.set_attr(op::attr::transpose_b, true); bmm1.add_inputs({query, key}); @@ -295,7 +302,7 @@ void bench_sdpa(engine::kind ekind, logical_tensor::data_type dt, // scaled_score = score / scale auto scale = logical_tensor(id++, dt, scale_sz, layout_type::strided); auto scaled_score - = logical_tensor(id++, dt, score_sz, layout_type::strided); + = logical_tensor(id++, dt_inter, score_sz, layout_type::strided); auto scale_div = op(id++, op::kind::Divide, "scale_div"); scale_div.add_inputs({score, scale}); scale_div.add_outputs({scaled_score}); @@ -303,7 +310,7 @@ void bench_sdpa(engine::kind ekind, logical_tensor::data_type dt, // masked_score = scaled_score + mask auto mask = logical_tensor(id++, dt, mask_sz, layout_type::strided); auto masked_score - = logical_tensor(id++, dt, score_sz, layout_type::strided); + = logical_tensor(id++, dt_inter, score_sz, layout_type::strided); auto mask_add = op(id++, op::kind::Add, "mask_add"); mask_add.add_inputs({scaled_score, mask}); mask_add.add_outputs({masked_score}); diff --git a/examples/graph/sdpa_stacked_qkv.cpp b/examples/graph/sdpa_stacked_qkv.cpp index dacae9b4672..29920224192 100644 --- a/examples/graph/sdpa_stacked_qkv.cpp +++ b/examples/graph/sdpa_stacked_qkv.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2024 Intel Corporation +* Copyright 2024-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -29,7 +29,6 @@ #include "graph_example_utils.hpp" using namespace dnnl; -using tag = memory::format_tag; using namespace dnnl::graph; using layout_type = logical_tensor::layout_type; @@ -143,6 +142,9 @@ void bench_sdpa(engine::kind ekind, logical_tensor::data_type dt, // Incremental IDs used to create logical tensors and operations. size_t id = 0; + // Intermediate data type + const logical_tensor::data_type dt_inter = logical_tensor::data_type::f32; + // This logical tensor is not part of the graph but is used to generate the // big chunk of device memory which should be already there in real user // application or framework. @@ -153,7 +155,7 @@ void bench_sdpa(engine::kind ekind, logical_tensor::data_type dt, auto key = logical_tensor(id++, dt, qkv_sz, qkv_strides); // Though query and key are non-contiguous above, the output score is still // contiguous. 
- auto score = logical_tensor(id++, dt, score_sz, layout_type::strided); + auto score = logical_tensor(id++, dt_inter, score_sz, layout_type::strided); auto bmm1 = op(id++, op::kind::MatMul, "bmm1"); bmm1.set_attr(op::attr::transpose_b, true); bmm1.add_inputs({query, key}); @@ -162,7 +164,7 @@ void bench_sdpa(engine::kind ekind, logical_tensor::data_type dt, // scaled_score = score / scale auto scale = logical_tensor(id++, dt, scale_sz, layout_type::strided); auto scaled_score - = logical_tensor(id++, dt, score_sz, layout_type::strided); + = logical_tensor(id++, dt_inter, score_sz, layout_type::strided); auto scale_div = op(id++, op::kind::Divide, "scale_div"); scale_div.add_inputs({score, scale}); scale_div.add_outputs({scaled_score}); @@ -170,7 +172,7 @@ void bench_sdpa(engine::kind ekind, logical_tensor::data_type dt, // masked_score = scaled_score + mask auto mask = logical_tensor(id++, dt, mask_sz, layout_type::strided); auto masked_score - = logical_tensor(id++, dt, score_sz, layout_type::strided); + = logical_tensor(id++, dt_inter, score_sz, layout_type::strided); auto mask_add = op(id++, op::kind::Add, "mask_add"); mask_add.add_inputs({scaled_score, mask}); mask_add.add_outputs({masked_score}); diff --git a/examples/matmul_perf.cpp b/examples/matmul_perf.cpp index d1d29677a53..083d082d549 100644 --- a/examples/matmul_perf.cpp +++ b/examples/matmul_perf.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2022 Intel Corporation +* Copyright 2022-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -28,20 +28,17 @@ using namespace dnnl; -using tag = memory::format_tag; -using dt = memory::data_type; - struct gemm_dims_t { memory::dim m, n, k; }; static const int min_runs = 4; -const char *get_type_string(dt type) { +const char *get_type_string(memory::data_type type) { const char *type_string = "unknown"; #define TYPE_CASE(T) \ - if (type == dt::T) type_string = #T; + if (type == memory::data_type::T) type_string = #T; TYPE_CASE(f16); TYPE_CASE(f32); TYPE_CASE(f64); @@ -53,7 +50,7 @@ const char *get_type_string(dt type) { return type_string; } -void print_test_case(dt type, gemm_dims_t dims) { +void print_test_case(memory::data_type type, gemm_dims_t dims) { std::cout << '[' << std::setw(4) << get_type_string(type); if (dims.m == dims.n && dims.m == dims.k) std::cout << " m = n = k = " << dims.m; @@ -89,9 +86,10 @@ void fill_random(std::vector &out, bool is_integer) { } } -double run_case(engine::kind engine_kind, dt type, gemm_dims_t dims, - double time_limit = 0.) { - bool is_integer = (type == dt::s8 || type == dt::u8); +double run_case(engine::kind engine_kind, memory::data_type type, + gemm_dims_t dims, double time_limit = 0.) { + bool is_integer + = (type == memory::data_type::s8 || type == memory::data_type::u8); bool quick_test = (time_limit == 0.); // Create execution dnnl::engine. @@ -115,12 +113,14 @@ double run_case(engine::kind engine_kind, dt type, gemm_dims_t dims, // Create memory descriptors and memory objects for src, weights, bias, and // dst. 
- auto a_md = memory::desc(a_dims, type, tag::any); - auto b_md = memory::desc(b_dims, type, tag::any); - auto c_md = memory::desc(c_dims, type, tag::any); + auto a_md = memory::desc(a_dims, type, memory::format_tag::any); + auto b_md = memory::desc(b_dims, type, memory::format_tag::any); + auto c_md = memory::desc(c_dims, type, memory::format_tag::any); - auto a_in_md = memory::desc(a_dims, dt::f32, tag::ab); - auto b_in_md = memory::desc(b_dims, dt::f32, tag::ab); + auto a_in_md = memory::desc( + a_dims, memory::data_type::f32, memory::format_tag::ab); + auto b_in_md = memory::desc( + b_dims, memory::data_type::f32, memory::format_tag::ab); auto a_in_mem = memory(a_in_md, engine); auto b_in_mem = memory(b_in_md, engine); @@ -197,7 +197,7 @@ double run_case(engine::kind engine_kind, dt type, gemm_dims_t dims, return avg_time; } -void run(engine::kind engine_kind, dt type, gemm_dims_t dims, +void run(engine::kind engine_kind, memory::data_type type, gemm_dims_t dims, double time_limit) { try { if (dims.m * dims.n != 0) { @@ -257,10 +257,10 @@ void matmul_perf(engine::kind engine_kind, int argc, char **argv) { if (dims.m <= 0 || dims.n <= 0 || dims.k <= 0) bad_args(); } - run(engine_kind, dt::f32, dims, 2.0); - run(engine_kind, dt::f16, dims, 2.0); - run(engine_kind, dt::bf16, dims, 2.0); - run(engine_kind, dt::s8, dims, 2.0); + run(engine_kind, memory::data_type::f32, dims, 2.0); + run(engine_kind, memory::data_type::f16, dims, 2.0); + run(engine_kind, memory::data_type::bf16, dims, 2.0); + run(engine_kind, memory::data_type::s8, dims, 2.0); } int main(int argc, char **argv) { diff --git a/examples/primitives/augru.cpp b/examples/primitives/augru.cpp index 28abc6557af..dd0901c6c62 100644 --- a/examples/primitives/augru.cpp +++ b/examples/primitives/augru.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2022 Intel Corporation +* Copyright 2022-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -42,9 +42,6 @@ using namespace dnnl; -using tag = memory::format_tag; -using dt = memory::data_type; - void augru_example(dnnl::engine::kind engine_kind) { if (engine_kind == engine::kind::gpu) @@ -100,10 +97,14 @@ void augru_example(dnnl::engine::kind engine_kind) { }); // Create memory descriptors and memory objects for src, bias, and dst. - auto src_layer_md = memory::desc(src_dims, dt::f32, tag::tnc); - auto attention_md = memory::desc(attention_dims, dt::f32, tag::tnc); - auto bias_md = memory::desc(bias_dims, dt::f32, tag::ldgo); - auto dst_layer_md = memory::desc(dst_dims, dt::f32, tag::tnc); + auto src_layer_md = memory::desc( + src_dims, memory::data_type::f32, memory::format_tag::tnc); + auto attention_md = memory::desc( + attention_dims, memory::data_type::f32, memory::format_tag::tnc); + auto bias_md = memory::desc( + bias_dims, memory::data_type::f32, memory::format_tag::ldgo); + auto dst_layer_md = memory::desc( + dst_dims, memory::data_type::f32, memory::format_tag::tnc); auto src_layer_mem = memory(src_layer_md, engine); auto attention_mem = memory(attention_md, engine); @@ -112,10 +113,12 @@ void augru_example(dnnl::engine::kind engine_kind) { // Create memory objects for weights using user's memory layout. In this // example, LDIGO is assumed. 
- auto user_weights_layer_mem - = memory({weights_dims, dt::f32, tag::ldigo}, engine); - auto user_weights_iter_mem - = memory({weights_dims, dt::f32, tag::ldigo}, engine); + auto user_weights_layer_mem = memory( + {weights_dims, memory::data_type::f32, memory::format_tag::ldigo}, + engine); + auto user_weights_iter_mem = memory( + {weights_dims, memory::data_type::f32, memory::format_tag::ldigo}, + engine); // Write data to memory object's handle. write_to_dnnl_memory(src_layer_data.data(), src_layer_mem); @@ -126,8 +129,10 @@ void augru_example(dnnl::engine::kind engine_kind) { // Create memory descriptors for weights with format_tag::any. This enables // the AUGRU primitive to choose the optimized memory layout. - auto augru_weights_layer_md = memory::desc(weights_dims, dt::f32, tag::any); - auto augru_weights_iter_md = memory::desc(weights_dims, dt::f32, tag::any); + auto augru_weights_layer_md = memory::desc( + weights_dims, memory::data_type::f32, memory::format_tag::any); + auto augru_weights_iter_md = memory::desc( + weights_dims, memory::data_type::f32, memory::format_tag::any); // Optional memory descriptors for recurrent data. auto src_iter_md = memory::desc(); diff --git a/examples/primitives/batch_normalization.cpp b/examples/primitives/batch_normalization.cpp index 5c3e163f968..088f8470cfa 100644 --- a/examples/primitives/batch_normalization.cpp +++ b/examples/primitives/batch_normalization.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2020-2022 Intel Corporation +* Copyright 2020-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -44,9 +44,6 @@ using namespace dnnl; -using tag = memory::format_tag; -using dt = memory::data_type; - void batch_normalization_example(dnnl::engine::kind engine_kind) { // Create execution dnnl::engine. @@ -91,9 +88,12 @@ void batch_normalization_example(dnnl::engine::kind engine_kind) { }); // Create src and scale/shift memory descriptors and memory objects. - auto src_md = memory::desc(src_dims, dt::f32, tag::nchw); - auto dst_md = memory::desc(src_dims, dt::f32, tag::nchw); - auto scaleshift_md = memory::desc(scaleshift_dims, dt::f32, tag::x); + auto src_md = memory::desc( + src_dims, memory::data_type::f32, memory::format_tag::nchw); + auto dst_md = memory::desc( + src_dims, memory::data_type::f32, memory::format_tag::nchw); + auto scaleshift_md = memory::desc( + scaleshift_dims, memory::data_type::f32, memory::format_tag::x); auto src_mem = memory(src_md, engine); auto scale_mem = memory(scaleshift_md, engine); diff --git a/examples/primitives/binary.cpp b/examples/primitives/binary.cpp index 5b4707bc886..999650d0932 100644 --- a/examples/primitives/binary.cpp +++ b/examples/primitives/binary.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2020-2022 Intel Corporation +* Copyright 2020-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -42,9 +42,6 @@ using namespace dnnl; -using tag = memory::format_tag; -using dt = memory::data_type; - void binary_example(dnnl::engine::kind engine_kind) { // Create execution dnnl::engine. @@ -78,9 +75,12 @@ void binary_example(dnnl::engine::kind engine_kind) { }); // Create src and dst memory descriptors. 
- auto src_0_md = memory::desc(src_0_dims, dt::f32, tag::nchw); - auto src_1_md = memory::desc(src_1_dims, dt::f32, tag::nchw); - auto dst_md = memory::desc(src_0_dims, dt::f32, tag::nchw); + auto src_0_md = memory::desc( + src_0_dims, memory::data_type::f32, memory::format_tag::nchw); + auto src_1_md = memory::desc( + src_1_dims, memory::data_type::f32, memory::format_tag::nchw); + auto dst_md = memory::desc( + src_0_dims, memory::data_type::f32, memory::format_tag::nchw); // Create src memory objects. auto src_0_mem = memory(src_0_md, engine); diff --git a/examples/primitives/concat.cpp b/examples/primitives/concat.cpp index e563722aeb8..50935810f98 100644 --- a/examples/primitives/concat.cpp +++ b/examples/primitives/concat.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2020-2022 Intel Corporation +* Copyright 2020-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -43,9 +43,6 @@ using namespace dnnl; -using tag = memory::format_tag; -using dt = memory::data_type; - void concat_example(dnnl::engine::kind engine_kind) { // Create execution dnnl::engine. @@ -85,7 +82,8 @@ void concat_example(dnnl::engine::kind engine_kind) { std::vector src_mems; for (int n = 0; n < num_src; ++n) { - auto md = memory::desc(src_dims, dt::f32, tag::nchw); + auto md = memory::desc( + src_dims, memory::data_type::f32, memory::format_tag::nchw); auto mem = memory(md, engine); // Write data to memory object's handle. diff --git a/examples/primitives/convolution.cpp b/examples/primitives/convolution.cpp index e3ecc0f9b94..cc8eb768363 100644 --- a/examples/primitives/convolution.cpp +++ b/examples/primitives/convolution.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2020-2022 Intel Corporation +* Copyright 2020-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -43,9 +43,6 @@ using namespace dnnl; -using tag = memory::format_tag; -using dt = memory::data_type; - void convolution_example(dnnl::engine::kind engine_kind) { // Create execution dnnl::engine. @@ -75,6 +72,7 @@ void convolution_example(dnnl::engine::kind engine_kind) { // dimensions. memory::dims src_dims = {N, IC, IH, IW}; memory::dims weights_dims = {OC, IC, KH, KW}; + // To simulate an empty bias use an empty initializer `{}`. memory::dims bias_dims = {OC}; memory::dims dst_dims = {N, OC, OH, OW}; @@ -86,7 +84,7 @@ void convolution_example(dnnl::engine::kind engine_kind) { // Allocate buffers. std::vector src_data(product(src_dims)); std::vector weights_data(product(weights_dims)); - std::vector bias_data(OC); + std::vector bias_data(product(bias_dims)); std::vector dst_data(product(dst_dims)); // Initialize src, weights, and dst tensors. @@ -105,26 +103,39 @@ void convolution_example(dnnl::engine::kind engine_kind) { // Create memory objects for tensor data (src, weights, dst). In this // example, NCHW layout is assumed for src and dst, and OIHW for weights. 
- auto user_src_mem = memory({src_dims, dt::f32, tag::nchw}, engine); - auto user_weights_mem = memory({weights_dims, dt::f32, tag::oihw}, engine); - auto user_dst_mem = memory({dst_dims, dt::f32, tag::nchw}, engine); + auto user_src_mem = memory( + {src_dims, memory::data_type::f32, memory::format_tag::nchw}, + engine); + auto user_weights_mem = memory( + {weights_dims, memory::data_type::f32, memory::format_tag::oihw}, + engine); + auto user_dst_mem = memory( + {dst_dims, memory::data_type::f32, memory::format_tag::nchw}, + engine); // Create memory descriptors with format_tag::any for the primitive. This // enables the convolution primitive to choose memory layouts for an // optimized primitive implementation, and these layouts may differ from the // ones provided by the user. - auto conv_src_md = memory::desc(src_dims, dt::f32, tag::any); - auto conv_weights_md = memory::desc(weights_dims, dt::f32, tag::any); - auto conv_dst_md = memory::desc(dst_dims, dt::f32, tag::any); + auto conv_src_md = memory::desc( + src_dims, memory::data_type::f32, memory::format_tag::any); + auto conv_weights_md = memory::desc( + weights_dims, memory::data_type::f32, memory::format_tag::any); + auto conv_dst_md = memory::desc( + dst_dims, memory::data_type::f32, memory::format_tag::any); // Create memory descriptor and memory object for input bias. - auto user_bias_md = memory::desc(bias_dims, dt::f32, tag::a); + auto user_bias_md = bias_dims.empty() + ? memory::desc() + : memory::desc( + bias_dims, memory::data_type::f32, memory::format_tag::a); auto user_bias_mem = memory(user_bias_md, engine); // Write data to memory object's handle. write_to_dnnl_memory(src_data.data(), user_src_mem); write_to_dnnl_memory(weights_data.data(), user_weights_mem); - write_to_dnnl_memory(bias_data.data(), user_bias_mem); + if (!bias_dims.empty()) + write_to_dnnl_memory(bias_data.data(), user_bias_mem); // Create primitive post-ops (ReLU). const float alpha = 0.f; @@ -254,20 +265,30 @@ void depthwise_convolution_example(dnnl::engine::kind engine_kind) { // Create memory objects for tensor data (src, weights, dst). In this // example, NCHW layout is assumed for src and dst, and OIHW for weights. - auto user_src_mem = memory({src_dims, dt::f32, tag::nchw}, engine); - auto user_weights_mem = memory({weights_dims, dt::f32, tag::goihw}, engine); - auto user_dst_mem = memory({dst_dims, dt::f32, tag::nchw}, engine); + auto user_src_mem = memory( + {src_dims, memory::data_type::f32, memory::format_tag::nchw}, + engine); + auto user_weights_mem = memory( + {weights_dims, memory::data_type::f32, memory::format_tag::goihw}, + engine); + auto user_dst_mem = memory( + {dst_dims, memory::data_type::f32, memory::format_tag::nchw}, + engine); // Create memory descriptors with format_tag::any for the primitive. This // enables the convolution primitive to choose memory layouts for an // optimized primitive implementation, and these layouts may differ from the // ones provided by the user. - auto conv_src_md = memory::desc(src_dims, dt::f32, tag::any); - auto conv_weights_md = memory::desc(weights_dims, dt::f32, tag::any); - auto conv_dst_md = memory::desc(dst_dims, dt::f32, tag::any); + auto conv_src_md = memory::desc( + src_dims, memory::data_type::f32, memory::format_tag::any); + auto conv_weights_md = memory::desc( + weights_dims, memory::data_type::f32, memory::format_tag::any); + auto conv_dst_md = memory::desc( + dst_dims, memory::data_type::f32, memory::format_tag::any); // Create memory descriptor and memory object for input bias. 
- auto user_bias_md = memory::desc(bias_dims, dt::f32, tag::a); + auto user_bias_md = memory::desc( + bias_dims, memory::data_type::f32, memory::format_tag::a); auto user_bias_mem = memory(user_bias_md, engine); // Write data to memory object's handle. diff --git a/examples/primitives/deconvolution.cpp b/examples/primitives/deconvolution.cpp index 841b7f7ba8d..f1efdc61386 100644 --- a/examples/primitives/deconvolution.cpp +++ b/examples/primitives/deconvolution.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2024 Intel Corporation +* Copyright 2024-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -43,9 +43,6 @@ using namespace dnnl; -using tag = memory::format_tag; -using dt = memory::data_type; - void deconvolution_example(dnnl::engine::kind engine_kind) { // Create execution dnnl::engine. @@ -111,20 +108,30 @@ void deconvolution_example(dnnl::engine::kind engine_kind) { // Create memory objects for tensor data (src, weights, dst). In this // example, NCHW layout is assumed for src and dst, and OIHW for weights. - auto user_src_mem = memory({src_dims, dt::f32, tag::nchw}, engine); - auto user_weights_mem = memory({weights_dims, dt::f32, tag::oihw}, engine); - auto user_dst_mem = memory({dst_dims, dt::f32, tag::nchw}, engine); + auto user_src_mem = memory( + {src_dims, memory::data_type::f32, memory::format_tag::nchw}, + engine); + auto user_weights_mem = memory( + {weights_dims, memory::data_type::f32, memory::format_tag::oihw}, + engine); + auto user_dst_mem = memory( + {dst_dims, memory::data_type::f32, memory::format_tag::nchw}, + engine); // Create memory descriptors with format_tag::any for the primitive. This // enables the deconvolution primitive to choose memory layouts for an // optimized primitive implementation, and these layouts may differ from the // ones provided by the user. - auto deconv_src_md = memory::desc(src_dims, dt::f32, tag::any); - auto deconv_weights_md = memory::desc(weights_dims, dt::f32, tag::any); - auto deconv_dst_md = memory::desc(dst_dims, dt::f32, tag::any); + auto deconv_src_md = memory::desc( + src_dims, memory::data_type::f32, memory::format_tag::any); + auto deconv_weights_md = memory::desc( + weights_dims, memory::data_type::f32, memory::format_tag::any); + auto deconv_dst_md = memory::desc( + dst_dims, memory::data_type::f32, memory::format_tag::any); // Create memory descriptor and memory object for input bias. - auto user_bias_md = memory::desc(bias_dims, dt::f32, tag::a); + auto user_bias_md = memory::desc( + bias_dims, memory::data_type::f32, memory::format_tag::a); auto user_bias_mem = memory(user_bias_md, engine); // Write data to memory object's handle. diff --git a/examples/primitives/eltwise.cpp b/examples/primitives/eltwise.cpp index 2bea8dcbe08..acd59ed26a3 100644 --- a/examples/primitives/eltwise.cpp +++ b/examples/primitives/eltwise.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2020-2022 Intel Corporation +* Copyright 2020-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -39,9 +39,6 @@ using namespace dnnl; -using tag = memory::format_tag; -using dt = memory::data_type; - void eltwise_example(dnnl::engine::kind engine_kind) { // Create execution dnnl::engine. 
@@ -73,8 +70,10 @@ void eltwise_example(dnnl::engine::kind engine_kind) { }); // Create src and dst memory descriptors and memory objects. - auto src_md = memory::desc(src_dims, dt::f32, tag::nchw); - auto dst_md = memory::desc(dst_dims, dt::f32, tag::nchw); + auto src_md = memory::desc( + src_dims, memory::data_type::f32, memory::format_tag::nchw); + auto dst_md = memory::desc( + dst_dims, memory::data_type::f32, memory::format_tag::nchw); auto src_mem = memory(src_md, engine); auto dst_mem = memory(dst_md, engine); diff --git a/examples/primitives/group_normalization.cpp b/examples/primitives/group_normalization.cpp index ce9ea87455f..84c67a41a57 100644 --- a/examples/primitives/group_normalization.cpp +++ b/examples/primitives/group_normalization.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2023-2024 Intel Corporation +* Copyright 2023-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -43,9 +43,6 @@ using namespace dnnl; -using tag = memory::format_tag; -using dt = memory::data_type; - void group_normalization_example(engine::kind engine_kind) { // Create execution dnnl::engine. dnnl::engine engine(engine_kind, 0); @@ -93,9 +90,12 @@ void group_normalization_example(engine::kind engine_kind) { }); // Create src and scale/shift memory descriptors and memory objects. - auto src_md = memory::desc(src_dims, dt::f32, tag::ncdhw); - auto dst_md = memory::desc(src_dims, dt::f32, tag::ncdhw); - auto scaleshift_md = memory::desc(scaleshift_dims, dt::f32, tag::x); + auto src_md = memory::desc( + src_dims, memory::data_type::f32, memory::format_tag::ncdhw); + auto dst_md = memory::desc( + src_dims, memory::data_type::f32, memory::format_tag::ncdhw); + auto scaleshift_md = memory::desc( + scaleshift_dims, memory::data_type::f32, memory::format_tag::x); auto src_mem = memory(src_md, engine); auto scale_mem = memory(scaleshift_md, engine); diff --git a/examples/primitives/inner_product.cpp b/examples/primitives/inner_product.cpp index f987b88ca16..334c092151c 100644 --- a/examples/primitives/inner_product.cpp +++ b/examples/primitives/inner_product.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2020-2022 Intel Corporation +* Copyright 2020-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -42,9 +42,6 @@ using namespace dnnl; -using tag = memory::format_tag; -using dt = memory::data_type; - void inner_product_example(dnnl::engine::kind engine_kind) { // Create execution dnnl::engine. @@ -89,9 +86,12 @@ void inner_product_example(dnnl::engine::kind engine_kind) { // Create memory descriptors and memory objects for src and dst. In this // example, NCHW layout is assumed. 
- auto src_md = memory::desc(src_dims, dt::f32, tag::nchw); - auto bias_md = memory::desc(bias_dims, dt::f32, tag::a); - auto dst_md = memory::desc(dst_dims, dt::f32, tag::nc); + auto src_md = memory::desc( + src_dims, memory::data_type::f32, memory::format_tag::nchw); + auto bias_md = memory::desc( + bias_dims, memory::data_type::f32, memory::format_tag::a); + auto dst_md = memory::desc( + dst_dims, memory::data_type::f32, memory::format_tag::nc); auto src_mem = memory(src_md, engine); auto bias_mem = memory(bias_md, engine); @@ -99,7 +99,9 @@ void inner_product_example(dnnl::engine::kind engine_kind) { // Create memory object for user's layout for weights. In this example, OIHW // is assumed. - auto user_weights_mem = memory({weights_dims, dt::f32, tag::oihw}, engine); + auto user_weights_mem = memory( + {weights_dims, memory::data_type::f32, memory::format_tag::oihw}, + engine); // Write data to memory object's handles. write_to_dnnl_memory(src_data.data(), src_mem); @@ -110,8 +112,8 @@ void inner_product_example(dnnl::engine::kind engine_kind) { // the inner product primitive to choose the memory layout for an optimized // primitive implementation, and this format may differ from the one // provided by the user. - auto inner_product_weights_md - = memory::desc(weights_dims, dt::f32, tag::any); + auto inner_product_weights_md = memory::desc( + weights_dims, memory::data_type::f32, memory::format_tag::any); // Create primitive post-ops (ReLU). const float alpha = 0.f; diff --git a/examples/primitives/layer_normalization.cpp b/examples/primitives/layer_normalization.cpp index 0079bc59b23..9bdc7dd2a8f 100644 --- a/examples/primitives/layer_normalization.cpp +++ b/examples/primitives/layer_normalization.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2020-2023 Intel Corporation +* Copyright 2020-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -43,9 +43,6 @@ using namespace dnnl; -using tag = memory::format_tag; -using dt = memory::data_type; - void layer_normalization_example(dnnl::engine::kind engine_kind) { /// Create execution dnnl::engine. @@ -89,9 +86,12 @@ void layer_normalization_example(dnnl::engine::kind engine_kind) { }); // Create src memory descriptor and memory objects. - auto src_md = memory::desc(src_dims, dt::f32, tag::tnc); - auto dst_md = memory::desc(src_dims, dt::f32, tag::tnc); - auto scaleshift_md = memory::desc(scaleshift_dims, dt::f32, tag::x); + auto src_md = memory::desc( + src_dims, memory::data_type::f32, memory::format_tag::tnc); + auto dst_md = memory::desc( + src_dims, memory::data_type::f32, memory::format_tag::tnc); + auto scaleshift_md = memory::desc( + scaleshift_dims, memory::data_type::f32, memory::format_tag::x); auto src_mem = memory(src_md, engine); auto scale_mem = memory(scaleshift_md, engine); @@ -105,7 +105,8 @@ void layer_normalization_example(dnnl::engine::kind engine_kind) { // Create primitive descriptor. 
const float epsilon = 1.e-10f; auto lnorm_pd = layer_normalization_forward::primitive_desc(engine, - prop_kind::forward_training, src_md, dst_md, dt::f32, epsilon, + prop_kind::forward_training, src_md, dst_md, memory::data_type::f32, + epsilon, normalization_flags::use_scale | normalization_flags::use_shift); // Use the memory descriptors from the primitive to create memory objects diff --git a/examples/primitives/lbr_gru.cpp b/examples/primitives/lbr_gru.cpp index aeba8103c88..d343b44a33c 100644 --- a/examples/primitives/lbr_gru.cpp +++ b/examples/primitives/lbr_gru.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2024 Intel Corporation +* Copyright 2024-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -42,9 +42,6 @@ using namespace dnnl; -using tag = memory::format_tag; -using dt = memory::data_type; - void lbr_gru_example(dnnl::engine::kind engine_kind) { // Create execution dnnl::engine. dnnl::engine engine(engine_kind, 0); @@ -98,9 +95,12 @@ void lbr_gru_example(dnnl::engine::kind engine_kind) { }); // Create memory descriptors and memory objects for src, bias, and dst. - auto src_layer_md = memory::desc(src_dims, dt::f32, tag::tnc); - auto bias_md = memory::desc(bias_dims, dt::f32, tag::ldgo); - auto dst_layer_md = memory::desc(dst_layer_dims, dt::f32, tag::tnc); + auto src_layer_md = memory::desc( + src_dims, memory::data_type::f32, memory::format_tag::tnc); + auto bias_md = memory::desc( + bias_dims, memory::data_type::f32, memory::format_tag::ldgo); + auto dst_layer_md = memory::desc( + dst_layer_dims, memory::data_type::f32, memory::format_tag::tnc); auto src_layer_mem = memory(src_layer_md, engine); auto bias_mem = memory(bias_md, engine); @@ -110,9 +110,13 @@ void lbr_gru_example(dnnl::engine::kind engine_kind) { // example, LDIGO (num_layers, num_directions, input_channels, num_gates, // output_channels) is assumed. auto user_weights_layer_mem - = memory({weights_layer_dims, dt::f32, tag::ldigo}, engine); + = memory({weights_layer_dims, memory::data_type::f32, + memory::format_tag::ldigo}, + engine); auto user_weights_iter_mem - = memory({weights_iter_dims, dt::f32, tag::ldigo}, engine); + = memory({weights_iter_dims, memory::data_type::f32, + memory::format_tag::ldigo}, + engine); // Write data to memory object's handle. // For GRU cells, the gates order is update, reset and output @@ -125,8 +129,10 @@ void lbr_gru_example(dnnl::engine::kind engine_kind) { // Create memory descriptors for weights with format_tag::any. This enables // the lbr_gru primitive to choose the optimized memory layout. - auto weights_layer_md = memory::desc(weights_layer_dims, dt::f32, tag::any); - auto weights_iter_md = memory::desc(weights_iter_dims, dt::f32, tag::any); + auto weights_layer_md = memory::desc(weights_layer_dims, + memory::data_type::f32, memory::format_tag::any); + auto weights_iter_md = memory::desc( + weights_iter_dims, memory::data_type::f32, memory::format_tag::any); // Optional memory descriptors for recurrent data. 
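Because the weight descriptors above use memory::format_tag::any, the layout the primitive ultimately selects may differ from the user's ldigo buffers. A sketch of the usual query-and-reorder idiom, assuming a primitive descriptor gru_pd and a stream engine_stream from the surrounding example:

    // Reorder user weights only if the primitive chose a different layout.
    auto weights_layer_mem = user_weights_layer_mem;
    if (gru_pd.weights_layer_desc() != user_weights_layer_mem.get_desc()) {
        weights_layer_mem = memory(gru_pd.weights_layer_desc(), engine);
        reorder(user_weights_layer_mem, weights_layer_mem)
                .execute(engine_stream, user_weights_layer_mem,
                        weights_layer_mem);
    }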
// Default memory descriptor for initial hidden states of the GRU cells diff --git a/examples/primitives/lrn.cpp b/examples/primitives/lrn.cpp index 0d6df092d32..ce75c807910 100644 --- a/examples/primitives/lrn.cpp +++ b/examples/primitives/lrn.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2020-2022 Intel Corporation +* Copyright 2020-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -39,9 +39,6 @@ using namespace dnnl; -using tag = memory::format_tag; -using dt = memory::data_type; - void lrn_example(dnnl::engine::kind engine_kind) { // Create execution dnnl::engine. @@ -69,8 +66,10 @@ void lrn_example(dnnl::engine::kind engine_kind) { }); // Create src and dst memory descriptors and memory objects. - auto src_md = memory::desc(src_dims, dt::f32, tag::nchw); - auto dst_md = memory::desc(src_dims, dt::f32, tag::nchw); + auto src_md = memory::desc( + src_dims, memory::data_type::f32, memory::format_tag::nchw); + auto dst_md = memory::desc( + src_dims, memory::data_type::f32, memory::format_tag::nchw); auto src_mem = memory(src_md, engine); auto dst_mem = memory(src_md, engine); diff --git a/examples/primitives/lstm.cpp b/examples/primitives/lstm.cpp index ba579944662..67514bb65c3 100644 --- a/examples/primitives/lstm.cpp +++ b/examples/primitives/lstm.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2020-2022 Intel Corporation +* Copyright 2020-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -42,9 +42,6 @@ using namespace dnnl; -using tag = memory::format_tag; -using dt = memory::data_type; - void lstm_example(dnnl::engine::kind engine_kind) { // Create execution dnnl::engine. @@ -90,9 +87,12 @@ void lstm_example(dnnl::engine::kind engine_kind) { }); // Create memory descriptors and memory objects for src, bias, and dst. - auto src_layer_md = memory::desc(src_dims, dt::f32, tag::tnc); - auto bias_md = memory::desc(bias_dims, dt::f32, tag::ldgo); - auto dst_layer_md = memory::desc(dst_dims, dt::f32, tag::tnc); + auto src_layer_md = memory::desc( + src_dims, memory::data_type::f32, memory::format_tag::tnc); + auto bias_md = memory::desc( + bias_dims, memory::data_type::f32, memory::format_tag::ldgo); + auto dst_layer_md = memory::desc( + dst_dims, memory::data_type::f32, memory::format_tag::tnc); auto src_layer_mem = memory(src_layer_md, engine); auto bias_mem = memory(bias_md, engine); @@ -100,10 +100,12 @@ void lstm_example(dnnl::engine::kind engine_kind) { // Create memory objects for weights using user's memory layout. In this // example, LDIGO is assumed. - auto user_weights_layer_mem - = memory({weights_dims, dt::f32, tag::ldigo}, engine); - auto user_weights_iter_mem - = memory({weights_dims, dt::f32, tag::ldigo}, engine); + auto user_weights_layer_mem = memory( + {weights_dims, memory::data_type::f32, memory::format_tag::ldigo}, + engine); + auto user_weights_iter_mem = memory( + {weights_dims, memory::data_type::f32, memory::format_tag::ldigo}, + engine); // Write data to memory object's handle. write_to_dnnl_memory(src_layer_data.data(), src_layer_mem); @@ -113,8 +115,10 @@ void lstm_example(dnnl::engine::kind engine_kind) { // Create memory descriptors for weights with format_tag::any. 
This enables // the LSTM primitive to choose the optimized memory layout. - auto lstm_weights_layer_md = memory::desc(weights_dims, dt::f32, tag::any); - auto lstm_weights_iter_md = memory::desc(weights_dims, dt::f32, tag::any); + auto lstm_weights_layer_md = memory::desc( + weights_dims, memory::data_type::f32, memory::format_tag::any); + auto lstm_weights_iter_md = memory::desc( + weights_dims, memory::data_type::f32, memory::format_tag::any); // Optional memory descriptors for recurrent data. auto src_iter_md = memory::desc(); diff --git a/examples/primitives/matmul.cpp b/examples/primitives/matmul.cpp index 08d3faf11ae..fff7efeb0c2 100644 --- a/examples/primitives/matmul.cpp +++ b/examples/primitives/matmul.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2020-2022 Intel Corporation +* Copyright 2020-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -41,9 +41,6 @@ using namespace dnnl; -using tag = memory::format_tag; -using dt = memory::data_type; - void matmul_example(dnnl::engine::kind engine_kind) { // Create execution dnnl::engine. @@ -84,10 +81,14 @@ void matmul_example(dnnl::engine::kind engine_kind) { // Create memory descriptors and memory objects for src, weights, bias, and // dst. - auto src_md = memory::desc(src_dims, dt::f32, tag::abc); - auto weights_md = memory::desc(weights_dims, dt::f32, tag::abc); - auto bias_md = memory::desc(bias_dims, dt::f32, tag::abc); - auto dst_md = memory::desc(dst_dims, dt::f32, tag::abc); + auto src_md = memory::desc( + src_dims, memory::data_type::f32, memory::format_tag::abc); + auto weights_md = memory::desc( + weights_dims, memory::data_type::f32, memory::format_tag::abc); + auto bias_md = memory::desc( + bias_dims, memory::data_type::f32, memory::format_tag::abc); + auto dst_md = memory::desc( + dst_dims, memory::data_type::f32, memory::format_tag::abc); auto src_mem = memory(src_md, engine); auto weights_mem = memory(weights_md, engine); diff --git a/examples/primitives/pooling.cpp b/examples/primitives/pooling.cpp index 92a2c877801..a84c37e6028 100644 --- a/examples/primitives/pooling.cpp +++ b/examples/primitives/pooling.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2020-2022 Intel Corporation +* Copyright 2020-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -39,9 +39,6 @@ using namespace dnnl; -using tag = memory::format_tag; -using dt = memory::data_type; - void pooling_example(dnnl::engine::kind engine_kind) { // Create execution dnnl::engine. @@ -66,6 +63,17 @@ void pooling_example(dnnl::engine::kind engine_kind) { DH = 1, // height-wise dilation DW = 1; // width-wise dilation + // oneDNN uses the following formula to calculate destination dimensions: + // dst = (src - ((weights - 1) * (dilation_onednn + 1) + 1)) / stride + 1 + // + // PyTorch and TensorFlow use a different formula: + // dst = (src - ((weights - 1) * dilation_torch + 1)) / stride + 1 + // + // As a result, the PyTorch and TensorFlow dilation parameters need to be + // adjusted by subtracting 1: + // dilation_onednn = dilation_torch - 1. + // +    // Output tensor height and width.
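To sanity-check the conversion with concrete (hypothetical) sizes: for a source extent of 60, kernel 3, stride 1, zero padding, and a PyTorch dilation of 2 (so the oneDNN dilation is 1), both conventions agree: oneDNN gives (60 - ((3 - 1) * (1 + 1) + 1)) / 1 + 1 = 56, and PyTorch gives (60 - ((3 - 1) * 2 + 1)) / 1 + 1 = 56.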
const memory::dim OH = (IH - ((KH - 1) * DH + KH) + PH_L + PH_R) / SH + 1; const memory::dim OW = (IW - ((KW - 1) * DW + KW) + PW_L + PW_R) / SW + 1; @@ -92,10 +100,12 @@ void pooling_example(dnnl::engine::kind engine_kind) { }); // Create memory descriptors and memory objects for src and dst. - auto src_md = memory::desc(src_dims, dt::f32, tag::nchw); + auto src_md = memory::desc( + src_dims, memory::data_type::f32, memory::format_tag::nchw); auto src_mem = memory(src_md, engine); - auto dst_md = memory::desc(dst_dims, dt::f32, tag::nchw); + auto dst_md = memory::desc( + dst_dims, memory::data_type::f32, memory::format_tag::nchw); auto dst_mem = memory(dst_md, engine); // Write data to memory object's handle. diff --git a/examples/primitives/prelu.cpp b/examples/primitives/prelu.cpp index 9f46e61231f..986f90bc553 100644 --- a/examples/primitives/prelu.cpp +++ b/examples/primitives/prelu.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2020-2022 Intel Corporation +* Copyright 2020-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -38,9 +38,6 @@ using namespace dnnl; -using tag = memory::format_tag; -using dt = memory::data_type; - void prelu_example(dnnl::engine::kind engine_kind) { // Create execution dnnl::engine. @@ -78,18 +75,27 @@ void prelu_example(dnnl::engine::kind engine_kind) { // Create memory objects for tensor data (src, weights, dst). In this // example, NCHW layout is assumed for src, weights and dst. - auto user_src_mem = memory({src_dims, dt::f32, tag::nchw}, engine); - auto user_weights_mem = memory({weights_dims, dt::f32, tag::nchw}, engine); - auto user_dst_mem = memory({dst_dims, dt::f32, tag::nchw}, engine); + auto user_src_mem = memory( + {src_dims, memory::data_type::f32, memory::format_tag::nchw}, + engine); + auto user_weights_mem = memory( + {weights_dims, memory::data_type::f32, memory::format_tag::nchw}, + engine); + auto user_dst_mem = memory( + {dst_dims, memory::data_type::f32, memory::format_tag::nchw}, + engine); // Create memory descriptors for the primitive. Src tag is set // to match src memory object. Setting weights tag to format_tag::any // enables the PReLU primitive to choose memory layout for an optimized // primitive implementation, and that layout may differ from the one // provided by the user. - auto src_md = memory::desc(src_dims, dt::f32, tag::nchw); - auto weights_md = memory::desc(weights_dims, dt::f32, tag::any); - auto dst_md = memory::desc(src_dims, dt::f32, tag::any); + auto src_md = memory::desc( + src_dims, memory::data_type::f32, memory::format_tag::nchw); + auto weights_md = memory::desc( + weights_dims, memory::data_type::f32, memory::format_tag::any); + auto dst_md = memory::desc( + src_dims, memory::data_type::f32, memory::format_tag::any); // Write data to memory object's handle. write_to_dnnl_memory(src_data.data(), user_src_mem); diff --git a/examples/primitives/reduction.cpp b/examples/primitives/reduction.cpp index 86dc1f70cbc..cde6abafdbc 100644 --- a/examples/primitives/reduction.cpp +++ b/examples/primitives/reduction.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2020-2022 Intel Corporation +* Copyright 2020-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -34,9 +34,6 @@ using namespace dnnl; -using tag = memory::format_tag; -using dt = memory::data_type; - void reduction_example(dnnl::engine::kind engine_kind) { // Create execution dnnl::engine. @@ -66,8 +63,10 @@ void reduction_example(dnnl::engine::kind engine_kind) { }); // Create src and dst memory descriptors and memory objects. - auto src_md = memory::desc(src_dims, dt::f32, tag::nchw); - auto dst_md = memory::desc(dst_dims, dt::f32, tag::nchw); + auto src_md = memory::desc( + src_dims, memory::data_type::f32, memory::format_tag::nchw); + auto dst_md = memory::desc( + dst_dims, memory::data_type::f32, memory::format_tag::nchw); auto src_mem = memory(src_md, engine); auto dst_mem = memory(dst_md, engine); diff --git a/examples/primitives/reorder.cpp b/examples/primitives/reorder.cpp index 066b6eef97c..aa6d87f6c5e 100644 --- a/examples/primitives/reorder.cpp +++ b/examples/primitives/reorder.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2020-2022 Intel Corporation +* Copyright 2020-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -41,9 +41,6 @@ using namespace dnnl; -using tag = memory::format_tag; -using dt = memory::data_type; - void reorder_example(dnnl::engine::kind engine_kind) { // Create execution dnnl::engine. @@ -72,8 +69,10 @@ void reorder_example(dnnl::engine::kind engine_kind) { }); // Create memory descriptors and memory objects for src and dst. - auto src_md = memory::desc(src_dims, dt::f32, tag::nchw); - auto dst_md = memory::desc(src_dims, dt::s8, tag::nhwc); + auto src_md = memory::desc( + src_dims, memory::data_type::f32, memory::format_tag::nchw); + auto dst_md = memory::desc( + src_dims, memory::data_type::s8, memory::format_tag::nhwc); auto src_mem = memory(src_md, engine); auto dst_mem = memory(dst_md, engine); @@ -94,7 +93,8 @@ void reorder_example(dnnl::engine::kind engine_kind) { // Create primitive post-ops (per-channel output scales) primitive_attr reorder_attr; reorder_attr.set_scales_mask(DNNL_ARG_DST, 1 << ic_dim); - auto dst_scales_mem = memory({{IC}, dt::f32, tag::x}, engine); + auto dst_scales_mem = memory( + {{IC}, memory::data_type::f32, memory::format_tag::x}, engine); write_to_dnnl_memory(scales.data(), dst_scales_mem); // Create primitive descriptor. diff --git a/examples/primitives/resampling.cpp b/examples/primitives/resampling.cpp index 59d6f948a6a..5bbefcd727a 100644 --- a/examples/primitives/resampling.cpp +++ b/examples/primitives/resampling.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2020-2022 Intel Corporation +* Copyright 2020-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -39,9 +39,6 @@ using namespace dnnl; -using tag = memory::format_tag; -using dt = memory::data_type; - void resampling_example(dnnl::engine::kind engine_kind) { // Create execution dnnl::engine. @@ -73,8 +70,10 @@ void resampling_example(dnnl::engine::kind engine_kind) { }); // Create memory descriptors and memory objects for src and dst. 
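One detail worth spelling out for the reorder hunk above: set_scales_mask() only declares that scales will exist; the values themselves are runtime arguments. A sketch of the corresponding execute call, assuming reorder_pd, src_mem, dst_mem, dst_scales_mem, and engine_stream from that example:

    // Runtime scales are passed under DNNL_ARG_ATTR_SCALES combined with
    // the argument they apply to (here, the destination).
    reorder(reorder_pd).execute(engine_stream,
            {{DNNL_ARG_SRC, src_mem}, {DNNL_ARG_DST, dst_mem},
                    {DNNL_ARG_ATTR_SCALES | DNNL_ARG_DST, dst_scales_mem}});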
- auto src_md = memory::desc(src_dims, dt::f32, tag::nchw); - auto dst_md = memory::desc(dst_dims, dt::f32, tag::nchw); + auto src_md = memory::desc( + src_dims, memory::data_type::f32, memory::format_tag::nchw); + auto dst_md = memory::desc( + dst_dims, memory::data_type::f32, memory::format_tag::nchw); auto src_mem = memory(src_md, engine); auto dst_mem = memory(dst_md, engine); diff --git a/examples/primitives/shuffle.cpp b/examples/primitives/shuffle.cpp index 6a437a23d63..7f3041e8587 100644 --- a/examples/primitives/shuffle.cpp +++ b/examples/primitives/shuffle.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2020-2022 Intel Corporation +* Copyright 2020-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -41,9 +41,6 @@ using namespace dnnl; -using tag = memory::format_tag; -using dt = memory::data_type; - void shuffle_example(dnnl::engine::kind engine_kind) { // Create execution dnnl::engine. @@ -76,11 +73,15 @@ void shuffle_example(dnnl::engine::kind engine_kind) { const int group_size = 4; // Create memory descriptor and memory objects for src and dst. - auto src_md = memory::desc(src_dims, dt::f32, tag::nchw); - auto dst_md = memory::desc(src_dims, dt::f32, tag::nchw); + auto src_md = memory::desc( + src_dims, memory::data_type::f32, memory::format_tag::nchw); + auto dst_md = memory::desc( + src_dims, memory::data_type::f32, memory::format_tag::nchw); auto src_mem = memory(src_md, engine); - auto dst_mem = memory({src_dims, dt::f32, tag::abcd}, engine); + auto dst_mem = memory( + {src_dims, memory::data_type::f32, memory::format_tag::abcd}, + engine); // Write data to memory object's handle. write_to_dnnl_memory(src_data.data(), src_mem); diff --git a/examples/primitives/softmax.cpp b/examples/primitives/softmax.cpp index 58160edc245..183ac7cde76 100644 --- a/examples/primitives/softmax.cpp +++ b/examples/primitives/softmax.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2020-2022 Intel Corporation +* Copyright 2020-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -43,9 +43,6 @@ using namespace dnnl; -using tag = memory::format_tag; -using dt = memory::data_type; - void softmax_example(dnnl::engine::kind engine_kind) { // Create execution dnnl::engine. @@ -70,8 +67,10 @@ void softmax_example(dnnl::engine::kind engine_kind) { }); // Create src memory descriptor and memory object. - auto src_md = memory::desc(src_dims, dt::f32, tag::nc); - auto dst_md = memory::desc(src_dims, dt::f32, tag::nc); + auto src_md = memory::desc( + src_dims, memory::data_type::f32, memory::format_tag::nc); + auto dst_md = memory::desc( + src_dims, memory::data_type::f32, memory::format_tag::nc); auto src_mem = memory(src_md, engine); // Write data to memory object's handle. diff --git a/examples/primitives/sum.cpp b/examples/primitives/sum.cpp index 19fc3e2e097..41149ff7146 100644 --- a/examples/primitives/sum.cpp +++ b/examples/primitives/sum.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2020-2022 Intel Corporation +* Copyright 2020-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -41,9 +41,6 @@ using namespace dnnl; -using tag = memory::format_tag; -using dt = memory::data_type; - void sum_example(dnnl::engine::kind engine_kind) { // Create execution dnnl::engine. @@ -84,7 +81,8 @@ std::vector<memory> src_mem; for (int n = 0; n < num_src; ++n) { - auto md = memory::desc(src_dims, dt::f32, tag::nchw); + auto md = memory::desc( + src_dims, memory::data_type::f32, memory::format_tag::nchw); auto mem = memory(md, engine); // Write data to memory object's handle. diff --git a/examples/primitives/vanilla_rnn.cpp b/examples/primitives/vanilla_rnn.cpp index a288468fc09..e8f2c99ce17 100644 --- a/examples/primitives/vanilla_rnn.cpp +++ b/examples/primitives/vanilla_rnn.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2024 Intel Corporation +* Copyright 2024-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -42,9 +42,6 @@ using namespace dnnl; -using tag = memory::format_tag; -using dt = memory::data_type; - void vanilla_rnn_example(dnnl::engine::kind engine_kind) { // Create execution dnnl::engine. dnnl::engine engine(engine_kind, 0); @@ -95,9 +92,12 @@ void vanilla_rnn_example(dnnl::engine::kind engine_kind) { // Create memory descriptors and memory objects for src, bias, and dst. - auto src_layer_md = memory::desc(src_dims, dt::f32, tag::tnc); - auto bias_md = memory::desc(bias_dims, dt::f32, tag::ldgo); - auto dst_layer_md = memory::desc(dst_layer_dims, dt::f32, tag::tnc); + auto src_layer_md = memory::desc( + src_dims, memory::data_type::f32, memory::format_tag::tnc); + auto bias_md = memory::desc( + bias_dims, memory::data_type::f32, memory::format_tag::ldgo); + auto dst_layer_md = memory::desc( + dst_layer_dims, memory::data_type::f32, memory::format_tag::tnc); auto src_layer_mem = memory(src_layer_md, engine); auto bias_mem = memory(bias_md, engine); @@ -106,10 +106,12 @@ void vanilla_rnn_example(dnnl::engine::kind engine_kind) { // Create memory objects for weights using user's memory layout. In this // example, LDIGO (num_layers, num_directions, input_channels, num_gates, // output_channels) is assumed. - auto user_weights_layer_mem - = memory({weights_dims, dt::f32, tag::ldigo}, engine); - auto user_weights_iter_mem - = memory({weights_dims, dt::f32, tag::ldigo}, engine); + auto user_weights_layer_mem = memory( + {weights_dims, memory::data_type::f32, memory::format_tag::ldigo}, + engine); + auto user_weights_iter_mem = memory( + {weights_dims, memory::data_type::f32, memory::format_tag::ldigo}, + engine); // Write data to memory object's handle. write_to_dnnl_memory(src_layer_data.data(), src_layer_mem); @@ -119,8 +121,10 @@ void vanilla_rnn_example(dnnl::engine::kind engine_kind) { // Create memory descriptors for weights with format_tag::any. This enables // the Vanilla primitive to choose the optimized memory layout. - auto weights_layer_md = memory::desc(weights_dims, dt::f32, tag::any); - auto weights_iter_md = memory::desc(weights_dims, dt::f32, tag::any); + auto weights_layer_md = memory::desc( + weights_dims, memory::data_type::f32, memory::format_tag::any); + auto weights_iter_md = memory::desc( + weights_dims, memory::data_type::f32, memory::format_tag::any); // Optional memory descriptors for recurrent data.
// Default memory descriptor for initial hidden states of the GRU cells diff --git a/examples/rnn_training_f32.cpp b/examples/rnn_training_f32.cpp index 42546f3adaa..fbde4aa0fa0 100644 --- a/examples/rnn_training_f32.cpp +++ b/examples/rnn_training_f32.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2018-2024 Intel Corporation +* Copyright 2018-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -68,9 +68,6 @@ const int common_n_layers = 1; const int lstm_n_gates = 4; void simple_net(engine::kind engine_kind) { - using tag = memory::format_tag; - using dt = memory::data_type; - auto eng = engine(engine_kind, 0); stream s(eng); @@ -173,14 +170,14 @@ void simple_net(engine::kind engine_kind) { // Create auxiliary f32 memory descriptor // based on user-supplied dimensions and layout. - auto formatted_md - = [=](const memory::dims &dimensions, memory::format_tag layout) { - return memory::desc {{dimensions}, dt::f32, layout}; - }; + auto formatted_md = [=](const memory::dims &dimensions, + memory::format_tag layout) { + return memory::desc {{dimensions}, memory::data_type::f32, layout}; + }; // Create auxiliary generic f32 memory descriptor // based on supplied dimensions, with format_tag::any. auto generic_md = [=](const memory::dims &dimensions) { - return formatted_md(dimensions, tag::any); + return formatted_md(dimensions, memory::format_tag::any); }; // @@ -203,8 +200,9 @@ void simple_net(engine::kind engine_kind) { // Memory for the user allocated memory // Suppose user data is in tnc format. - auto net_src_memory - = dnnl::memory({{net_src_dims}, dt::f32, tag::tnc}, eng); + auto net_src_memory = dnnl::memory( + {{net_src_dims}, memory::data_type::f32, memory::format_tag::tnc}, + eng); write_to_dnnl_memory(net_src.data(), net_src_memory); // src_layer memory of the leftmost and rightmost RNN primitives // are accessed through the respective sub-memories in larger memory. @@ -222,34 +220,44 @@ void simple_net(engine::kind engine_kind) { // primitive prefers it in a different format.
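For readers skimming the lambdas above: formatted_md is just shorthand for an f32 memory::desc with an explicit layout, and generic_md additionally defers the layout choice to the primitive via format_tag::any. A quick equivalence check using the example's own net_src_dims (assert from <cassert>):

    // These two descriptors are identical by construction.
    auto a = formatted_md(net_src_dims, memory::format_tag::tnc);
    auto b = memory::desc(
            {net_src_dims}, memory::data_type::f32, memory::format_tag::tnc);
    assert(a == b);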
std::vector<float> user_common_weights_layer( tz_volume(common_weights_layer_dims), 1.0f); - auto user_common_weights_layer_memory = dnnl::memory( - {common_weights_layer_dims, dt::f32, tag::ldigo}, eng); + auto user_common_weights_layer_memory + = dnnl::memory({common_weights_layer_dims, memory::data_type::f32, + memory::format_tag::ldigo}, + eng); write_to_dnnl_memory( user_common_weights_layer.data(), user_common_weights_layer_memory); std::vector<float> user_common_weights_iter( tz_volume(common_weights_iter_dims), 1.0f); - auto user_common_weights_iter_memory = dnnl::memory( - {{common_weights_iter_dims}, dt::f32, tag::ldigo}, eng); + auto user_common_weights_iter_memory + = dnnl::memory({{common_weights_iter_dims}, memory::data_type::f32, + memory::format_tag::ldigo}, + eng); write_to_dnnl_memory( user_common_weights_layer.data(), user_common_weights_iter_memory); std::vector<float> user_common_bias(tz_volume(common_bias_dims), 1.0f); auto user_common_bias_memory - = dnnl::memory({{common_bias_dims}, dt::f32, tag::ldgo}, eng); + = dnnl::memory({{common_bias_dims}, memory::data_type::f32, + memory::format_tag::ldgo}, + eng); write_to_dnnl_memory(user_common_bias.data(), user_common_bias_memory); std::vector<float> user_leftmost_dst_layer( tz_volume(leftmost_dst_layer_dims), 1.0f); auto user_leftmost_dst_layer_memory - = dnnl::memory({{leftmost_dst_layer_dims}, dt::f32, tag::tnc}, eng); + = dnnl::memory({{leftmost_dst_layer_dims}, memory::data_type::f32, + memory::format_tag::tnc}, + eng); write_to_dnnl_memory( user_leftmost_dst_layer.data(), user_leftmost_dst_layer_memory); std::vector<float> user_rightmost_dst_layer( tz_volume(rightmost_dst_layer_dims), 1.0f); - auto user_rightmost_dst_layer_memory = dnnl::memory( - {{rightmost_dst_layer_dims}, dt::f32, tag::tnc}, eng); + auto user_rightmost_dst_layer_memory + = dnnl::memory({{rightmost_dst_layer_dims}, memory::data_type::f32, + memory::format_tag::tnc}, + eng); write_to_dnnl_memory( user_rightmost_dst_layer.data(), user_rightmost_dst_layer_memory); @@ -265,7 +273,8 @@ void simple_net(engine::kind engine_kind) { generic_md(common_weights_layer_dims), // weights_layer_desc generic_md(common_weights_iter_dims), // weights_iter_desc generic_md(common_bias_dims), // bias_desc - formatted_md(leftmost_dst_layer_dims, tag::tnc), // dst_layer_desc + formatted_md(leftmost_dst_layer_dims, + memory::format_tag::tnc), // dst_layer_desc generic_md(leftmost_dst_iter_dims), // dst_iter_desc generic_md(leftmost_dst_iter_c_dims) // dst_iter_c_desc ); @@ -304,7 +313,8 @@ void simple_net(engine::kind engine_kind) { generic_md(common_weights_layer_dims), // weights_layer_desc generic_md(common_weights_iter_dims), // weights_iter_desc generic_md(common_bias_dims), // bias_desc - formatted_md(rightmost_dst_layer_dims, tag::tnc), // dst_layer_desc + formatted_md(rightmost_dst_layer_dims, + memory::format_tag::tnc), // dst_layer_desc memory::desc(), // dst_iter_desc memory::desc() // dst_iter_c_desc ); @@ -410,8 +420,8 @@ void simple_net(engine::kind engine_kind) { // User-provided memory for backward by data output std::vector<float> net_diff_src(tz_volume(net_src_dims), 1.0f); - auto net_diff_src_memory - = dnnl::memory(formatted_md(net_src_dims, tag::tnc), eng); + auto net_diff_src_memory = dnnl::memory( + formatted_md(net_src_dims, memory::format_tag::tnc), eng); write_to_dnnl_memory(net_diff_src.data(), net_diff_src_memory); // diff_src follows the same layout we have for net_src @@ -429,13 +439,14 @@ void simple_net(engine::kind engine_kind) { std::vector<float> user_common_diff_weights_layer(
tz_volume(common_weights_layer_dims), 1.0f); auto user_common_diff_weights_layer_memory = dnnl::memory( - formatted_md(common_weights_layer_dims, tag::ldigo), eng); + formatted_md(common_weights_layer_dims, memory::format_tag::ldigo), + eng); write_to_dnnl_memory(user_common_diff_weights_layer.data(), user_common_diff_weights_layer_memory); std::vector<float> user_common_diff_bias(tz_volume(common_bias_dims), 1.0f); - auto user_common_diff_bias_memory - = dnnl::memory(formatted_md(common_bias_dims, tag::ldgo), eng); + auto user_common_diff_bias_memory = dnnl::memory( + formatted_md(common_bias_dims, memory::format_tag::ldgo), eng); write_to_dnnl_memory( user_common_diff_bias.data(), user_common_diff_bias_memory); @@ -448,8 +459,8 @@ void simple_net(engine::kind engine_kind) { }; // Suppose user data is in tnc format. std::vector<float> net_diff_dst(tz_volume(net_diff_dst_dims), 1.0f); - auto net_diff_dst_memory - = dnnl::memory(formatted_md(net_diff_dst_dims, tag::tnc), eng); + auto net_diff_dst_memory = dnnl::memory( + formatted_md(net_diff_dst_dims, memory::format_tag::tnc), eng); write_to_dnnl_memory(net_diff_dst.data(), net_diff_dst_memory); // diff_dst_layer memory of the leftmost and rightmost RNN primitives // are accessed through the respective sub-memory in larger memory. @@ -474,7 +485,8 @@ generic_md(common_weights_layer_dims), // weights_layer_desc generic_md(common_weights_iter_dims), // weights_iter_desc generic_md(common_bias_dims), // bias_desc - formatted_md(leftmost_dst_layer_dims, tag::tnc), // dst_layer_desc + formatted_md(leftmost_dst_layer_dims, + memory::format_tag::tnc), // dst_layer_desc generic_md(leftmost_dst_iter_dims), // dst_iter_desc generic_md(leftmost_dst_iter_c_dims), // dst_iter_c_desc user_leftmost_diff_src_layer_md, // diff_src_layer_desc @@ -519,7 +531,8 @@ generic_md(common_weights_layer_dims), // weights_layer_desc generic_md(common_weights_iter_dims), // weights_iter_desc generic_md(common_bias_dims), // bias_desc - formatted_md(rightmost_dst_layer_dims, tag::tnc), // dst_layer_desc + formatted_md(rightmost_dst_layer_dims, + memory::format_tag::tnc), // dst_layer_desc memory::desc(), // dst_iter_desc memory::desc(), // dst_iter_c_desc user_rightmost_diff_src_layer_md, // diff_src_layer_desc diff --git a/examples/sycl_interop_buffer.cpp b/examples/sycl_interop_buffer.cpp index 7ef65de296a..59972ad6cbb 100644 --- a/examples/sycl_interop_buffer.cpp +++ b/examples/sycl_interop_buffer.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2023 Intel Corporation +* Copyright 2019-2024 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -38,7 +38,7 @@ /// @section sycl_interop_buffer_cpp_headers Public headers /// /// To start using oneDNN, we must first include the @ref dnnl.hpp -/// header file in the application. We also include CL/sycl.hpp from DPC++ for +/// header file in the application. We also include sycl/sycl.hpp from DPC++ for /// using SYCL APIs and @ref dnnl_debug.h, which contains some debugging /// facilities such as returning a string representation /// for common oneDNN C types.
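For context on what sycl_interop_buffer.cpp does with that header: the buffer flavor of the interop API wraps a sycl::buffer as the storage behind a oneDNN memory object. A minimal sketch (md, N, and engine are assumed to exist in the surrounding code):

    #include <sycl/sycl.hpp>
    #include "oneapi/dnnl/dnnl_sycl.hpp"

    sycl::buffer<float, 1> buf {sycl::range<1>(N)};
    auto mem = dnnl::sycl_interop::make_memory(
            md, engine, dnnl::sycl_interop::memory_kind::buffer);
    dnnl::sycl_interop::set_buffer(mem, buf); // memory now reads/writes buf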
@@ -56,8 +56,6 @@ #if __has_include(<sycl/sycl.hpp>) #include <sycl/sycl.hpp> -#elif __has_include(<CL/sycl.hpp>) -#include <CL/sycl.hpp> #else #error "Unsupported compiler" #endif diff --git a/examples/sycl_interop_usm.cpp b/examples/sycl_interop_usm.cpp index a61d8bbf353..713c05b9ab5 100644 --- a/examples/sycl_interop_usm.cpp +++ b/examples/sycl_interop_usm.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2023 Intel Corporation +* Copyright 2019-2024 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,8 +21,6 @@ #if __has_include(<sycl/sycl.hpp>) #include <sycl/sycl.hpp> -#elif __has_include(<CL/sycl.hpp>) -#include <CL/sycl.hpp> #else #error "Unsupported compiler" #endif diff --git a/examples/tutorials/matmul/cpu_matmul_quantization.cpp b/examples/tutorials/matmul/cpu_matmul_quantization.cpp index 5fee5ed17de..b7c0264b944 100644 --- a/examples/tutorials/matmul/cpu_matmul_quantization.cpp +++ b/examples/tutorials/matmul/cpu_matmul_quantization.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2022 Intel Corporation +* Copyright 2019-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -154,7 +154,7 @@ void compute_q10n_params(const char *message, const std::vector<float> &v, #ifndef OMIT_WORKAROUND_FOR_SKX // Read more in CPU / Section 1 here: - // https://oneapi-src.github.io/oneDNN/dev_guide_int8_computations.html + // https://uxlfoundation.github.io/oneDNN/dev_guide_int8_computations.html if (std::is_same<T, uint8_t>::value) max_int /= 2; #endif @@ -203,7 +203,10 @@ int compare_vectors(const std::vector<float> &v1, } // namespace -engine eng(engine::kind::cpu, 0); // We create a global engine for simplicity +const engine &eng() { + static const engine eng(engine::kind::cpu, 0); + return eng; +} // Quantize float data into X_int_m oneDNN memory using the q10n parameters // @@ -216,23 +219,23 @@ engine eng(engine::kind::cpu, 0); // We create a global engine for simplicity // - X_int_m -- prepared oneDNN memory that would hold quantized values void quantize(const std::vector<float> &X_f32, float scale_X, int32_t zp_X, memory &X_int_m) { - using dt = memory::data_type; - - stream s(eng); + stream s(eng()); memory::desc x_int_md = X_int_m.get_desc(); const auto &dims = x_int_md.get_dims(); - memory::desc x_f32_md({dims[0], dims[1]}, dt::f32, {dims[1], 1}); - memory X_f32_m(x_f32_md, eng, (void *)X_f32.data()); + memory::desc x_f32_md( + {dims[0], dims[1]}, memory::data_type::f32, {dims[1], 1}); + memory X_f32_m(x_f32_md, eng(), (void *)X_f32.data()); primitive_attr q10n_attr; q10n_attr.set_scales_mask(DNNL_ARG_DST, /* mask */ 0); q10n_attr.set_zero_points_mask(DNNL_ARG_DST, /* mask */ 0); - reorder::primitive_desc q10n_pd(eng, x_f32_md, eng, x_int_md, q10n_attr); - memory dst_scale_X_m({{1}, dt::f32, {1}}, eng, &scale_X); - memory zp_X_m({{1}, dt::s32, {1}}, eng, &zp_X); + reorder::primitive_desc q10n_pd( + eng(), x_f32_md, eng(), x_int_md, q10n_attr); + memory dst_scale_X_m({{1}, memory::data_type::f32, {1}}, eng(), &scale_X); + memory zp_X_m({{1}, memory::data_type::s32, {1}}, eng(), &zp_X); reorder(q10n_pd).execute(s, {{DNNL_ARG_SRC, X_f32_m}, {DNNL_ARG_DST, X_int_m}, {DNNL_ARG_ATTR_SCALES | DNNL_ARG_DST, dst_scale_X_m}, @@ -256,15 +259,15 @@ void f32_matmul_compute(int64_t M, int64_t N, int64_t K, memory::desc c_md({M, N}, memory::data_type::f32, {N, 1}); // Wrap raw pointers into
oneDNN memory objects - memory A_f32_m(a_md, eng, (void *)A_f32.data()); - memory B_f32_m(b_md, eng, (void *)B_f32.data()); - memory C_f32_m(c_md, eng, (void *)C_f32.data()); + memory A_f32_m(a_md, eng(), (void *)A_f32.data()); + memory B_f32_m(b_md, eng(), (void *)B_f32.data()); + memory C_f32_m(c_md, eng(), (void *)C_f32.data()); // Create a MatMul primitive - matmul::primitive_desc matmul_pd(eng, a_md, b_md, c_md); + matmul::primitive_desc matmul_pd(eng(), a_md, b_md, c_md); matmul matmul_p(matmul_pd); - stream s(eng); + stream s(eng()); matmul_p.execute(s, {{DNNL_ARG_SRC, A_f32_m}, {DNNL_ARG_WEIGHTS, B_f32_m}, {DNNL_ARG_DST, C_f32_m}}); @@ -281,7 +284,7 @@ void f32_matmul_compute(int64_t M, int64_t N, int64_t K, void dynamic_q10n_matmul(int64_t M, int64_t N, int64_t K, const std::vector<float> &A_f32, const std::vector<float> &B_f32, std::vector<uint8_t> &C_u8, float &scale_C, int32_t &zp_C) { - stream s(eng); + stream s(eng()); float scale_A, scale_B; int32_t zp_A, zp_B; @@ -295,13 +298,13 @@ void dynamic_q10n_matmul(int64_t M, int64_t N, int64_t K, // Quantize matrix A_u8 using reorder primitive std::vector<uint8_t> A_u8(M * K, 0); memory::desc a_u8_md({M, K}, memory::data_type::u8, {K, 1}); - memory A_u8_m(a_u8_md, eng, (void *)A_u8.data()); + memory A_u8_m(a_u8_md, eng(), (void *)A_u8.data()); quantize(A_f32, scale_A, zp_A, A_u8_m); // Quantize matrix B_s8 using reorder primitive std::vector<int8_t> B_s8(K * N, 0); memory::desc b_s8_md({K, N}, memory::data_type::s8, {N, 1}); - memory B_s8_m(b_s8_md, eng, (void *)B_s8.data()); + memory B_s8_m(b_s8_md, eng(), (void *)B_s8.data()); quantize(B_f32, scale_B, 0, B_s8_m); // Compute C_f32. We cannot directly compute C_u8 since we don't know the @@ -319,7 +322,7 @@ void dynamic_q10n_matmul(int64_t M, int64_t N, int64_t K, std::vector<float> C_f32(M * N, 0); memory::desc c_f32_md({M, N}, memory::data_type::f32, {N, 1}); - memory C_f32_m(c_f32_md, eng, (void *)C_f32.data()); + memory C_f32_m(c_f32_md, eng(), (void *)C_f32.data()); // Create and compute a reduced precision MatMul primitive { @@ -329,12 +332,12 @@ matmul_attr.set_zero_points_mask(DNNL_ARG_SRC, /* mask */ 0); matmul::primitive_desc matmul_pd( - eng, a_u8_md, b_s8_md, c_f32_md, matmul_attr); + eng(), a_u8_md, b_s8_md, c_f32_md, matmul_attr); matmul matmul_p(matmul_pd); - memory scales_A_m({{1}, memory::data_type::f32, {1}}, eng, &scale_A); - memory scales_B_m({{1}, memory::data_type::f32, {1}}, eng, &scale_B); - memory zp_A_m({{1}, memory::data_type::s32, {1}}, eng, &zp_A); + memory scales_A_m({{1}, memory::data_type::f32, {1}}, eng(), &scale_A); + memory scales_B_m({{1}, memory::data_type::f32, {1}}, eng(), &scale_B); + memory zp_A_m({{1}, memory::data_type::s32, {1}}, eng(), &zp_A); matmul_p.execute(s, {{DNNL_ARG_SRC, A_u8_m}, {DNNL_ARG_WEIGHTS, B_s8_m}, @@ -349,7 +352,7 @@ // Finally quantize the matrix C memory::desc c_u8_md({M, N}, memory::data_type::u8, {N, 1}); - memory C_u8_m(c_u8_md, eng, (void *)C_u8.data()); + memory C_u8_m(c_u8_md, eng(), (void *)C_u8.data()); quantize(C_f32, scale_C, zp_C, C_u8_m); } diff --git a/examples/tutorials/matmul/cpu_sgemm_and_matmul.cpp b/examples/tutorials/matmul/cpu_sgemm_and_matmul.cpp index 643b8a2f473..749a6911608 100644 --- a/examples/tutorials/matmul/cpu_sgemm_and_matmul.cpp +++ b/examples/tutorials/matmul/cpu_sgemm_and_matmul.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2022 Intel
Corporation +* Copyright 2019-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -111,7 +111,10 @@ int compare_vectors(const std::vector<float> &v1, const std::vector<float> &v2, int number_of_runs = 1; float fixed_beta = 0.f; -engine eng(engine::kind::cpu, 0); // We create a global engine for simplicity +const engine &eng() { + static const engine eng(engine::kind::cpu, 0); + return eng; +} // Create a _dynamic_ MatMul primitive that can work with arbitrary shapes // and alpha parameters. @@ -143,7 +146,7 @@ matmul dynamic_matmul_create() { } // Create a MatMul primitive - matmul::primitive_desc matmul_pd(eng, a_md, b_md, c_md, attr); + matmul::primitive_desc matmul_pd(eng(), a_md, b_md, c_md, attr); return matmul(matmul_pd); } @@ -164,15 +167,15 @@ void dynamic_matmul_execute(matmul &matmul_p, char transA, char transB, dims b_strides = tolower(transB) == 'n' ? dims {ldb, 1} : dims {1, ldb}; // Wrap raw pointers into oneDNN memories (with proper shapes) - memory A_m({{M, K}, memory::data_type::f32, a_strides}, eng, (void *)A); - memory B_m({{K, N}, memory::data_type::f32, b_strides}, eng, (void *)B); - memory C_m({{M, N}, memory::data_type::f32, {ldc, 1}}, eng, (void *)C); + memory A_m({{M, K}, memory::data_type::f32, a_strides}, eng(), (void *)A); + memory B_m({{K, N}, memory::data_type::f32, b_strides}, eng(), (void *)B); + memory C_m({{M, N}, memory::data_type::f32, {ldc, 1}}, eng(), (void *)C); // Prepare oneDNN memory for alpha - memory alpha_m({{1}, memory::data_type::f32, {1}}, eng, &alpha); + memory alpha_m({{1}, memory::data_type::f32, {1}}, eng(), &alpha); // Execute the MatMul primitive - stream s(eng); + stream s(eng()); matmul_p.execute(s, {{DNNL_ARG_SRC, A_m}, {DNNL_ARG_WEIGHTS, B_m}, {DNNL_ARG_DST, C_m}, {DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, alpha_m}}); diff --git a/examples/tutorials/matmul/weights_decompression_matmul.cpp b/examples/tutorials/matmul/weights_decompression_matmul.cpp index ead80cfc451..b5d5d465f8a 100644 --- a/examples/tutorials/matmul/weights_decompression_matmul.cpp +++ b/examples/tutorials/matmul/weights_decompression_matmul.cpp @@ -160,10 +160,10 @@ void infer(const matmul &matmul_p, int64_t M, int64_t N, int64_t K, int64_t G, void weights_decompression_matmul(engine::kind engine_kind) { engine eng(engine_kind, 0); - const int64_t K = 96; + const int64_t K = 64; const int64_t N = 1000; const int64_t M = 100; - // Quantization Group size for scales + // Quantization Group size for scales. Must be divisible by 32. const int64_t G = K / 2; auto matmul_pd = matmul_pd_create(M, N, K, G, eng); diff --git a/examples/ukernels/cpu_brgemm.cpp b/examples/ukernels/cpu_brgemm.cpp index 5c119f5c527..2b2bc45c72e 100644 --- a/examples/ukernels/cpu_brgemm.cpp +++ b/examples/ukernels/cpu_brgemm.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2024 Intel Corporation +* Copyright 2024-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -36,9 +36,6 @@ using namespace dnnl; using namespace dnnl::ukernel; -using tag = memory::format_tag; -using dt = memory::data_type; - void brgemm_example() { // Create execution dnnl::engine.
Needed for reorders to operate over input @@ -57,17 +54,37 @@ void brgemm_example() { } const memory::dim n_calls = K / K_k; + memory::data_type a_dt = memory::data_type::u8; + memory::data_type b_dt = memory::data_type::s8; + memory::data_type c_dt = memory::data_type::s32; // Accumulator data type. + memory::data_type d_dt = memory::data_type::f32; // Output data type. + + // Query the packing requirement from the ukernel. It's enough to query + // packing requirements once for multiple objects. + // Based on this information, a specific `ldb` value can be used, since + // the transform routine supports only a limited set of values. + bool need_pack = false; + try { + need_pack = brgemm::get_B_pack_type(a_dt, b_dt) == pack_type::pack32; + } catch (error &e) { + if (e.status == dnnl_unimplemented) + throw example_allows_unimplemented { + "Kernel is not supported on this platform.\n"}; + + // on any other error just re-throw + throw; + } + const memory::dim lda = K; + // `ldb` for `need_pack = true` must be one of 16, 32, 48, or 64. This + // example doesn't explore options for dividing N into blocks, which would + // likely be needed for N > 64. + // const memory::dim ldb = need_pack ? N_block : N; const memory::dim ldb = N; const memory::dim ldc = N; // Leading dimension for accumulator. const memory::dim ldd = N; // Leading dimension for an actual output. const memory::dim batch_size = n_calls - 1; - memory::data_type a_dt = dt::u8; - memory::data_type b_dt = dt::s8; - memory::data_type c_dt = dt::s32; // Accumulator data type. - memory::data_type d_dt = dt::f32; // Output data type. - // A, B, and C tensors dimensions. memory::dims A_dims = {M, K}; memory::dims B_dims = {K, N}; @@ -111,11 +128,16 @@ void brgemm_example() { // Create f32 memories. They are used as data holders and reordered into // memories passed to the ukernel. - auto A_f32_md = memory::desc(A_dims, dt::f32, tag::ab); - auto B_f32_md = memory::desc(B_dims, dt::f32, tag::ab); - auto binary_add_f32_md = memory::desc(binary_add_dims, dt::f32, tag::ab); - auto B_scales_f32_md = memory::desc(B_scales_dims, dt::f32, tag::ab); - auto D_f32_md = memory::desc(D_dims, dt::f32, tag::ab); + auto A_f32_md = memory::desc( + A_dims, memory::data_type::f32, memory::format_tag::ab); + auto B_f32_md = memory::desc( + B_dims, memory::data_type::f32, memory::format_tag::ab); + auto binary_add_f32_md = memory::desc( + binary_add_dims, memory::data_type::f32, memory::format_tag::ab); + auto B_scales_f32_md = memory::desc( + B_scales_dims, memory::data_type::f32, memory::format_tag::ab); + auto D_f32_md = memory::desc( + D_dims, memory::data_type::f32, memory::format_tag::ab); auto A_f32_mem = memory(A_f32_md, engine, A_user_data.data()); auto B_f32_mem = memory(B_f32_md, engine, B_user_data.data()); @@ -127,12 +149,14 @@ void brgemm_example() { // Create ukernel memories in requested data types. // Note that all formats are `ab`.
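When need_pack comes back true, B has to be repacked before any execute call; the ukernel API provides a transform object for this. A condensed sketch using the example's own names (B_packed_ptr is hypothetical; the exact K extent to pack follows the example's blocking scheme):

    // Repack B from a plain ab layout into the pack32 layout the BRGeMM
    // ukernel expects; the last four arguments are the source and
    // destination leading dimensions and data types.
    ukernel::transform pack_B(/* K = */ K_k, /* N = */ N,
            pack_type::no_trans, /* in_ld = */ N, /* out_ld = */ ldb,
            /* in_dt = */ b_dt, /* out_dt = */ b_dt);
    pack_B.generate();
    pack_B.execute(B_ptr, B_packed_ptr);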
- auto A_md = memory::desc(A_dims, a_dt, tag::ab); - auto B_md = memory::desc(B_dims, b_dt, tag::ab); - auto binary_add_md = memory::desc(binary_add_dims, dt::f32, tag::ab); - auto B_scales_md = memory::desc(B_scales_dims, dt::f32, tag::ab); - auto C_md = memory::desc(C_dims, c_dt, tag::ab); - auto D_md = memory::desc(D_dims, d_dt, tag::ab); + auto A_md = memory::desc(A_dims, a_dt, memory::format_tag::ab); + auto B_md = memory::desc(B_dims, b_dt, memory::format_tag::ab); + auto binary_add_md = memory::desc( + binary_add_dims, memory::data_type::f32, memory::format_tag::ab); + auto B_scales_md = memory::desc( + B_scales_dims, memory::data_type::f32, memory::format_tag::ab); + auto C_md = memory::desc(C_dims, c_dt, memory::format_tag::ab); + auto D_md = memory::desc(D_dims, d_dt, memory::format_tag::ab); auto A_mem = memory(A_md, engine); auto B_mem = memory(B_md, engine); @@ -213,7 +237,7 @@ void brgemm_example() { // Specify post-ops for the brgemm object. brg_po.set_post_ops(ldd, d_dt, brgemm_ops); // Specify quantization scales for B. - if (b_dt == dt::s8 || b_dt == dt::u8) { + if (b_dt == memory::data_type::s8 || b_dt == memory::data_type::u8) { brg_po.set_B_scales(/* mask = */ 2); } // Finalize the initialization. @@ -239,12 +263,6 @@ void brgemm_example() { void *B_base_ptr = B_ptr; size_t blocked_B_size = 0; - // Query the packing requirement from the kernel. It's enough to query - // packing requirements from a single object as long as only dimension - // settings change between objects. - // Note: example uses the one that always present regardless of dimensions. - const bool need_pack = brg_po.get_B_pack_type() == pack_type::pack32; - // If packing is needed, create a dedicated object for data transformation. if (need_pack) { // Packing B tensor routine. The BRGeMM ukernel expects B passed in a @@ -312,11 +330,21 @@ void brgemm_example() { params.set_post_ops_args(bin_po_ptrs.data()); params.set_B_scales(B_scales_mem.get_data_handle()); - // An execute call. The difference here is an additional D tensor pointer - // to store final output result after finishing accumulation and post-ops - // application. - brg_po.execute(A_ptr, B_base_ptr, A_B_po_offsets, C_ptr, - D_mem.get_data_handle(), scratchpad.data(), params); + // An execute call. The difference here is that, when post operations are + // requested, an additional D tensor pointer is required to store the + // final output after accumulation and post-ops application. + // Additionally, a special `params` object carrying the post operation + // handles is required. + // + // If post operations are not defined, that call is invalid; a dedicated + // API call checks this state. + if (brg_po.is_execute_postops_valid()) { + brg_po.execute(A_ptr, B_base_ptr, A_B_po_offsets, C_ptr, + D_mem.get_data_handle(), scratchpad.data(), params); + } else { + brg_po.execute( + A_ptr, B_base_ptr, A_B_po_offsets, C_ptr, scratchpad.data()); + } // Once all computations are done, need to release HW context. brgemm::release_hw_context(); diff --git a/include/oneapi/dnnl/dnnl.h b/include/oneapi/dnnl/dnnl.h index fbc34a49ad5..ab5871f1c2e 100644 --- a/include/oneapi/dnnl/dnnl.h +++ b/include/oneapi/dnnl/dnnl.h @@ -24,6 +24,7 @@ #include "oneapi/dnnl/dnnl_config.h" #include "oneapi/dnnl/dnnl_types.h" #include "oneapi/dnnl/dnnl_version.h" +#include #ifdef __cplusplus extern "C" { @@ -420,6 +421,8 @@ dnnl_status_t DNNL_API dnnl_primitive_attr_set_scratchpad_mode( /// otherwise.
dnnl_status_t DNNL_API dnnl_primitive_attr_set_scales_mask( dnnl_primitive_attr_t attr, int arg, int mask); +dnnl_status_t DNNL_API dnnl_primitive_attr_set_scales_dims( + dnnl_primitive_attr_t attr, int arg, const dnnl_dims_t dims, int ndims, dnnl_data_type_t data_type); /// Sets primitive attributes scaling factors for primitive operations for a /// given memory argument. The scaling factors must be passed at execution time @@ -467,6 +470,8 @@ dnnl_status_t DNNL_API dnnl_primitive_attr_set_scales( /// otherwise. dnnl_status_t DNNL_API dnnl_primitive_attr_set_zero_points_mask( dnnl_primitive_attr_t attr, int arg, int mask); +dnnl_status_t DNNL_API dnnl_primitive_attr_set_zero_points_dims( + dnnl_primitive_attr_t attr, int arg, const dnnl_dims_t dims, int ndims, dnnl_data_type_t data_type); /// Sets primitive attributes zero points for primitive operations for a given /// memory argument. The zero points must be passed at execution time @@ -499,7 +504,7 @@ dnnl_status_t DNNL_API dnnl_primitive_attr_set_zero_points( /// /// @param attr Primitive attributes. /// @param arg Argument for which rounding mode should be set. -/// @params mode Rounding mode to apply to the argument. +/// @param mode Rounding mode to apply to the argument. /// @returns #dnnl_success on success and a status describing the error /// otherwise. dnnl_status_t DNNL_API dnnl_primitive_attr_set_rounding( @@ -509,12 +514,24 @@ /// /// @param attr Primitive attributes. /// @param arg Argument for which rounding mode query applies. -/// @params mode Output rounding mode. +/// @param mode Output rounding mode. /// @returns #dnnl_success on success and a status describing the error /// otherwise. dnnl_status_t DNNL_API dnnl_primitive_attr_get_rounding( dnnl_primitive_attr_t attr, int arg, dnnl_rounding_mode_t *mode); +dnnl_status_t DNNL_API dnnl_primitive_attr_get_output_compensations( + const_dnnl_primitive_attr_t attr, int *count, int *mask, const int32_t **compensations); + +dnnl_status_t DNNL_API dnnl_primitive_attr_set_output_compensations( + dnnl_primitive_attr_t attr, int count, int mask); + +dnnl_status_t DNNL_API dnnl_primitive_attr_set_input_zero_points( + dnnl_primitive_attr_t attr, int count, int mask); + +dnnl_status_t DNNL_API dnnl_primitive_attr_set_weights_zero_points( + dnnl_primitive_attr_t attr, int count, int mask); + /// Returns primitive attributes post-ops. /// /// @warning @@ -716,6 +733,13 @@ dnnl_status_t DNNL_API dnnl_post_ops_get_params_dw( dnnl_data_type_t *dst_data_type, dnnl_dim_t *kernel_size, dnnl_dim_t *stride_size, dnnl_dim_t *padding_l_size); +/// Appends DW convolution post operation to the @p post_ops with the given +/// spatial, kernel, stride, and data type parameters. +/// +/// The kind of this post operation is #dnnl_convolution. +dnnl_status_t DNNL_API dnnl_post_ops_append_dw_conv( + dnnl_post_ops_t post_ops, int in_h, int in_w, int ker_h, int ker_w, int str_h, int str_w, dnnl_data_type_t in_dt); + /// Appends a binary post-op. /// /// The kind of this post operation is #dnnl_binary.
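As a cross-reference for the scales-related declarations above: the mask-based flavor maps directly onto the C++ API, and the mask value encodes which dimensions carry distinct scales. A sketch (the per-output-channel mask is just an illustration):

    // Declare per-output-channel (dimension 1) scales on weights; the
    // actual values are passed at execution time under
    // DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS.
    dnnl::primitive_attr attr;
    attr.set_scales_mask(DNNL_ARG_WEIGHTS, 1 << 1);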
@@ -795,6 +819,18 @@ dnnl_status_t DNNL_API dnnl_post_ops_append_prelu( dnnl_status_t DNNL_API dnnl_post_ops_get_params_prelu( const_dnnl_post_ops_t post_ops, int index, int *mask); +/// Appends a depthwise post-op defined by the algorithm kind @p alg; +/// @p offset is an array of @p offset_size buffer offsets. +dnnl_status_t DNNL_API dnnl_post_ops_append_depthwise( + dnnl_post_ops_t post_ops, dnnl_alg_kind_t alg, size_t offset_size, const size_t* offset); + +/// Appends a quantization post-op defined by the algorithm kind @p alg with +/// the given per-channel flags, all-default flags, and buffer offsets. +dnnl_status_t DNNL_API dnnl_post_ops_append_quantization( + dnnl_post_ops_t post_ops, dnnl_alg_kind_t alg, + size_t per_channel_size, const bool* per_channel, + size_t all_default_size, const bool* all_default, + size_t offset_size, const size_t* offset); + +/// Appends a binarization post-op defined by the algorithm kind @p alg with +/// the given @p weights_data thresholds and @p output_mask. +dnnl_status_t DNNL_API dnnl_post_ops_append_binarization( + dnnl_post_ops_t post_ops, dnnl_alg_kind_t alg, const float* weights_data, const float* output_mask); + /// @} dnnl_api_attributes /// @} dnnl_api_primitives @@ -898,6 +934,29 @@ dnnl_status_t DNNL_API dnnl_memory_desc_create_with_csr_encoding( dnnl_data_type_t data_type, dnnl_dim_t nnz, dnnl_data_type_t indices_dt, dnnl_data_type_t pointers_dt); +/// Creates a memory descriptor for COO encoding. +/// +/// The created memory descriptor will describe a memory object that +/// contains n+1 buffers for an n-dimensional tensor. +/// The buffers have the following meaning and assigned numbers (index): +/// - 0: values +/// - 1: indices for dimension 0 +/// - 2: indices for dimension 1 ... +/// - n: indices for dimension n-1 +/// +/// @param memory_desc Output memory descriptor. +/// @param ndims Number of dimensions. +/// @param dims Array of dimensions. +/// @param data_type Elements data type. +/// @param nnz Number of non-zero entries. +/// @param indices_dt Data type of indices. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_memory_desc_create_with_coo_encoding( + dnnl_memory_desc_t *memory_desc, int ndims, const dnnl_dims_t dims, + dnnl_data_type_t data_type, dnnl_dim_t nnz, + dnnl_data_type_t indices_dt); + /// Creates a memory descriptor for packed sparse encoding. /// /// The created memory descriptor cannot be used to create a memory @@ -921,6 +980,19 @@ dnnl_status_t DNNL_API dnnl_memory_desc_create_with_packed_encoding( dnnl_memory_desc_t *memory_desc, int ndims, const dnnl_dims_t dims, dnnl_data_type_t data_type, dnnl_dim_t nnz); #endif +/// Initializes a sparse descriptor. +/// +/// @param memory_desc Output memory descriptor. +/// @param encoding Encoding. +/// @param ndims Number of dimensions. +/// @param dims Array of dimensions. +/// @param data_type Elements data type. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_memory_desc_create_sparse( + dnnl_memory_desc_t *memory_desc, + dnnl_sparse_encoding_t encoding, int ndims, + const dnnl_dims_t dims, dnnl_data_type_t data_type); /// Creates a memory descriptor for a region inside an area /// described by an existing memory descriptor. @@ -1175,7 +1247,6 @@ size_t DNNL_API dnnl_memory_desc_get_size(const_dnnl_memory_desc_t memory_desc); size_t DNNL_API dnnl_memory_desc_get_size_v2( const_dnnl_memory_desc_t memory_desc, int index); #endif - /// Returns the size of data type. /// /// @param data_type Data type. @@ -1228,7 +1299,6 @@ dnnl_status_t DNNL_API dnnl_memory_create_v2(dnnl_memory_t *memory, const_dnnl_memory_desc_t memory_desc, dnnl_engine_t engine, int nhandles, void **handles); #endif - /// Returns the memory descriptor for a memory object. /// /// @param memory Memory object.
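To make the COO buffer numbering above concrete, a short sketch of creating such a descriptor; the 4x4 shape, nnz of 3, and s32 index type are illustrative:

```cpp
#include "oneapi/dnnl/dnnl.h"

void coo_md_sketch() {
    // A 4 x 4 f32 tensor with 3 non-zeros: buffer 0 holds the 3 values,
    // buffer 1 the 3 row indices, buffer 2 the 3 column indices.
    dnnl_dims_t dims = {4, 4};
    dnnl_memory_desc_t md = nullptr;
    dnnl_status_t st = dnnl_memory_desc_create_with_coo_encoding(
            &md, 2, dims, dnnl_f32, /* nnz = */ 3, dnnl_s32);
    if (st == dnnl_success) dnnl_memory_desc_destroy(md);
}
```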
@@ -1340,7 +1410,6 @@ dnnl_status_t DNNL_API dnnl_memory_unmap_data( dnnl_status_t DNNL_API dnnl_memory_unmap_data_v2( const_dnnl_memory_t memory, void *mapped_ptr, int index); #endif - /// Returns memory object's data handle. /// /// @param memory Memory object. @@ -1477,7 +1546,7 @@ dnnl_status_t DNNL_API dnnl_sum_primitive_desc_create( /// Creates a primitive descriptor for a binary primitive. /// /// @note -/// Memory descriptors @p src1_desc and @p dst_desc are alloweded to be +/// Memory descriptors @p src1_desc and @p dst_desc are allowed to be /// initialized with #dnnl_format_tag_any or with format_kind set to /// #dnnl_format_kind_any. /// @@ -1504,6 +1573,37 @@ dnnl_status_t DNNL_API dnnl_binary_primitive_desc_create( const_dnnl_memory_desc_t src1_desc, const_dnnl_memory_desc_t dst_desc, const_dnnl_primitive_attr_t attr); +/// Creates a primitive descriptor for a binary primitive with support of +/// ternary operators. +/// +/// @note +/// Memory descriptors @p src1_desc, @p src2_desc and @p dst_desc are +/// allowed to be initialized with #dnnl_format_tag_any or with format_kind +/// set to #dnnl_format_kind_any. +/// +/// @note +/// All memory descriptors must have the same number of dimensions. +/// Element broadcasting is supported for memory descriptor @p src1_desc +/// and is applied to @p src1_desc dimensions that have a size equal to 1. +/// There is no broadcasting support for @p src2_desc. +/// +/// @param primitive_desc Output primitive descriptor. +/// @param engine Engine to use. +/// @param alg_kind Algorithm kind. +/// @param src0_desc Source 0 memory descriptor. +/// @param src1_desc Source 1 memory descriptor. +/// @param src2_desc Source memory descriptor for ternary operations. Might +/// be empty. +/// @param dst_desc Destination memory descriptor. +/// @param attr Primitive attributes. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_binary_primitive_desc_create_v2( + dnnl_primitive_desc_t *primitive_desc, dnnl_engine_t engine, + dnnl_alg_kind_t alg_kind, const_dnnl_memory_desc_t src0_desc, + const_dnnl_memory_desc_t src1_desc, const_dnnl_memory_desc_t src2_desc, + const_dnnl_memory_desc_t dst_desc, const_dnnl_primitive_attr_t attr); + /// @} dnnl_api_binary /// @addtogroup dnnl_api_convolution @@ -2638,6 +2738,10 @@ dnnl_status_t DNNL_API dnnl_primitive_attr_get_rnn_weights_projection_qparams( const_dnnl_primitive_attr_t attr, dnnl_dim_t *count, int *mask, const float **scales); +dnnl_status_t DNNL_API dnnl_primitive_attr_set_src_dyn_quant_params( + dnnl_primitive_attr_t attr, uint64_t group_size); +dnnl_status_t DNNL_API dnnl_primitive_attr_get_src_dyn_quant_params( + dnnl_primitive_attr_t attr, uint64_t* group_size); /// @} dnnl_api_attributes /// @addtogroup dnnl_api_rnn diff --git a/include/oneapi/dnnl/dnnl.hpp b/include/oneapi/dnnl/dnnl.hpp index 1dc369eaa05..8e55eabfcd0 100644 --- a/include/oneapi/dnnl/dnnl.hpp +++ b/include/oneapi/dnnl/dnnl.hpp @@ -1,5 +1,6 @@ /******************************************************************************* -* Copyright 2016-2024 Intel Corporation +* Copyright 2016-2025 Intel Corporation +* Copyright 2024-2025 FUJITSU LIMITED * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
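A hedged sketch of calling the ternary-capable `dnnl_binary_primitive_desc_create_v2` entry point declared above for a select operation. Engine setup and cleanup are minimal, and treating `src2` as an s8 condition tensor is an assumption based on the select semantics rather than something the declaration spells out:

```cpp
#include "oneapi/dnnl/dnnl.h"

void binary_select_sketch() {
    dnnl_engine_t engine = nullptr;
    dnnl_engine_create(&engine, dnnl_cpu, 0);

    // dst = src2 ? src0 : src1, element-wise over 2 x 3 tensors.
    dnnl_dims_t dims = {2, 3};
    dnnl_memory_desc_t src0, src1, src2, dst;
    dnnl_memory_desc_create_with_tag(&src0, 2, dims, dnnl_f32, dnnl_ab);
    dnnl_memory_desc_create_with_tag(&src1, 2, dims, dnnl_f32, dnnl_ab);
    dnnl_memory_desc_create_with_tag(&src2, 2, dims, dnnl_s8, dnnl_ab);
    dnnl_memory_desc_create_with_tag(&dst, 2, dims, dnnl_f32, dnnl_ab);

    dnnl_primitive_desc_t pd = nullptr;
    dnnl_binary_primitive_desc_create_v2(&pd, engine, dnnl_binary_select,
            src0, src1, src2, dst, /* attr = */ nullptr);

    if (pd) dnnl_primitive_desc_destroy(pd);
    for (auto *md : {src0, src1, src2, dst}) dnnl_memory_desc_destroy(md);
    dnnl_engine_destroy(engine);
}
```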
@@ -19,6 +20,7 @@ #ifndef ONEAPI_DNNL_DNNL_HPP #define ONEAPI_DNNL_DNNL_HPP +// NOLINTBEGIN(readability-identifier-naming) #include "oneapi/dnnl/dnnl_config.h" @@ -29,6 +31,7 @@ #include #include #include +#include <array> #include #include "oneapi/dnnl/dnnl.h" @@ -148,6 +151,10 @@ struct primitive : public handle { layer_normalization = dnnl_layer_normalization, /// A group normalization primitive group_normalization = dnnl_group_normalization, + + /// A depthwise primitive + depthwise = dnnl_depthwise, + /// A quantization primitive + quantization = dnnl_quantization, + /// A binarization primitive + binarization = dnnl_binarization, }; using handle::handle; @@ -168,7 +175,7 @@ struct primitive : public handle { const std::vector<uint8_t> &cache_blob); /// Constructs a primitive from a primitive descriptor. /// /// @param pd Primitive descriptor. primitive(const primitive_desc &pd); @@ -406,6 +413,12 @@ enum class algorithm { eltwise_hardswish = dnnl_eltwise_hardswish, /// Elementwise: hardsigmoid eltwise_hardsigmoid = dnnl_eltwise_hardsigmoid, + /// Elementwise: hsigmoid + eltwise_hsigmoid = dnnl_eltwise_hsigmoid, + /// Elementwise: round_half_to_even + eltwise_round_half_to_even = dnnl_eltwise_round_half_to_even, + /// Elementwise: round_half_away_from_zero + eltwise_round_half_away_from_zero = dnnl_eltwise_round_half_away_from_zero, /// Elementwise: rectified linear unit (ReLU) (dst for backward) eltwise_relu_use_dst_for_bwd = dnnl_eltwise_relu_use_dst_for_bwd, /// Elementwise: hyperbolic tangent non-linearity (tanh) (dst for backward) @@ -470,6 +483,10 @@ enum class algorithm { binary_eq = dnnl_binary_eq, /// Binary not equal binary_ne = dnnl_binary_ne, + /// Binary select + binary_select = dnnl_binary_select, + /// Binary prelu + binary_prelu = dnnl_binary_prelu, /// Nearest Neighbor resampling method resampling_nearest = dnnl_resampling_nearest, /// Linear (Bilinear, Trilinear) resampling method @@ -496,6 +513,13 @@ enum class algorithm { softmax_accurate = dnnl_softmax_accurate, /// LogSoftmax, numerically stable softmax_log = dnnl_softmax_log, + + /// Depthwise: scale and shift + depthwise_scale_shift = dnnl_depthwise_scale_shift, + /// Depthwise: PReLU + depthwise_prelu = dnnl_depthwise_prelu, + + /// Quantization: quantize and dequantize + quantization_quantize_dequantize = dnnl_quantization_quantize_dequantize, + /// Quantization: quantize only + quantization_quantize = dnnl_quantization_quantize, + /// Binarization: depthwise + binarization_depthwise = dnnl_binarization_depthwise, }; /// Converts algorithm kind enum value from C++ API to C API type. @@ -831,10 +855,10 @@ struct memory : public handle { using handle::handle; /// Integer type for representing dimension sizes and indices. - typedef dnnl_dim_t dim; + using dim = dnnl_dim_t; /// Vector of dimensions. Implementations are free to force a limit on the /// vector's length. - typedef std::vector<dim> dims; + using dims = std::vector<dim>; /// Helper function that validates that an `std::vector` of dimensions can /// be safely converted to the C API array ::dnnl_dims_t. Throws if @@ -852,6 +876,10 @@ struct memory : public handle { enum class data_type { /// Undefined data type (used for empty memory descriptors). undef = dnnl_data_type_undef, + /// 4-bit float data type with 3-bit exponent and 0 bit mantissa. + f4_e3m0 = dnnl_f4_e3m0, + /// [MX-compliant 4-bit float data type](https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf) with 2-bit exponent and 1 bit mantissa. + f4_e2m1 = dnnl_f4_e2m1, /// [MX-compliant 8-bit compliant scale data type](https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf) with 8-bit exponent.
e8m0 = dnnl_e8m0, /// [OFP8 standard 8-bit floating-point](https://www.opencompute.org/documents/ocp-8-bit-floating-point-specification-ofp8-revision-1-0-2023-06-20-pdf) @@ -879,6 +907,10 @@ struct memory : public handle { s4 = dnnl_s4, /// 4-bit unsigned integer. u4 = dnnl_u4, + /// 1-bit integer + bin = dnnl_bin, + /// 4-bit normalized float. + nf4 = dnnl_nf4, }; /// Returns size of data type in bytes. @@ -901,6 +933,7 @@ struct memory : public handle { /// Format kind for sparse tensors. sparse = dnnl_format_kind_sparse, #endif + sparsed = dnnl_format_sparse, /// A special format kind that indicates that tensor format is opaque. opaque = dnnl_format_kind_opaque, }; @@ -918,6 +951,8 @@ struct memory : public handle { /// only be used to create a primitive descriptor to query the /// actual memory descriptor (similar to the format tag `any`). packed = dnnl_packed, + /// Coordinate Sparse (COO) encoding. + coo = dnnl_coo, }; #endif @@ -1213,6 +1248,7 @@ struct memory : public handle { AB16b64a2b = dnnl_AB16b64a2b, Ab4a = dnnl_Ab4a, Ab8a = dnnl_Ab8a, + Ab32a = dnnl_Ab32a, Abc16a = dnnl_Abc16a, ABc16a16b = dnnl_ABc16a16b, ABc4a4b = dnnl_ABc4a4b, @@ -1302,6 +1338,7 @@ struct memory : public handle { aBCd4b4c = dnnl_aBCd4b4c, ABcd8a16b2a = dnnl_ABcd8a16b2a, ABcd8a8b = dnnl_ABcd8a8b, + ABcd8a32b = dnnl_ABcd8a32b, ABcd8a4b = dnnl_ABcd8a4b, ABcd8a2b = dnnl_ABcd8a2b, /// 4D tensor blocked by 2nd dimension with block size 8 @@ -1407,6 +1444,7 @@ struct memory : public handle { aBdeC8b4c = dnnl_aBdeC8b4c, aBdefc16b = dnnl_aBdefc16b, aCBdef16c16b = dnnl_aCBdef16c16b, + aCBdef8b8c = dnnl_aCBdef8b8c, aCBdef16b16c = dnnl_aCBdef16b16c, aBdefc4b = dnnl_aBdefc4b, aBdefc8b = dnnl_aBdefc8b, @@ -1417,8 +1455,10 @@ struct memory : public handle { Acb8a = dnnl_Acb8a, AcB8a2b = dnnl_AcB8a2b, AcB8a4b = dnnl_AcB8a4b, + aCBd8b8c = dnnl_aCBd8b8c, aCBd16b16c = dnnl_aCBd16b16c, aCBd16c16b = dnnl_aCBd16c16b, + aCBde8b8c = dnnl_aCBde8b8c, aCBde16b16c = dnnl_aCBde16b16c, aCBde16c16b = dnnl_aCBde16c16b, Acdb16a = dnnl_Acdb16a, @@ -1431,14 +1471,19 @@ struct memory : public handle { Acdeb8a = dnnl_Acdeb8a, AcdeB8a2b = dnnl_AcdeB8a2b, AcdeB8a4b = dnnl_AcdeB8a4b, + BAc8a8b = dnnl_BAc8a8b, BAc16a16b = dnnl_BAc16a16b, BAc16b16a = dnnl_BAc16b16a, + BAcd8a8b = dnnl_BAcd8a8b, BAcd16a16b = dnnl_BAcd16a16b, BAcd16b16a = dnnl_BAcd16b16a, ABcd32a32b = dnnl_ABcd32a32b, BAcde16b16a = dnnl_BAcde16b16a, + BAcde8a8b = dnnl_BAcde8a8b, BAcde16a16b = dnnl_BAcde16a16b, aBdec32b = dnnl_aBdec32b, + Abcdef4a = dnnl_Abcdef4a, + Abcdef8a = dnnl_Abcdef8a, Abcdef16a = dnnl_Abcdef16a, Abcdef32a = dnnl_Abcdef32a, Acdb32a = dnnl_Acdb32a, @@ -1460,10 +1505,12 @@ struct memory : public handle { AB8a2b = dnnl_AB8a2b, abDc16d = dnnl_abDc16d, abDc32d = dnnl_abDc32d, + abDC16d4c = dnnl_abDC16d4c, abDC32d4c = dnnl_abDC32d4c, abCd32c = dnnl_abCd32c, abdEc16e = dnnl_abdEc16e, abdEc32e = dnnl_abdEc32e, + abdEC16e4c = dnnl_abdEC16e4c, abdEC32e2c = dnnl_abdEC32e2c, abdEC32e4c = dnnl_abdEC32e4c, abdCe16c = dnnl_abdCe16c, @@ -1596,6 +1643,9 @@ struct memory : public handle { BA16a32b4a = dnnl_BA16a32b4a, BA16a48b4a = dnnl_BA16a48b4a, BA16a64b4a = dnnl_BA16a64b4a, + BA24b8a = dnnl_BA24b8a, + aCB24c8b = dnnl_aCB24c8b, + abDC24d8c = dnnl_abDC24d8c, decbA16a = dnnl_decbA16a, decbA8a = dnnl_decbA8a, defcbA16a = dnnl_defcbA16a, @@ -1686,7 +1736,10 @@ struct memory : public handle { IOdhw16i16o = dnnl_IOdhw16i16o, gIOhw16i16o = dnnl_gIOhw16i16o, gOhwi32o = dnnl_gOhwi32o, + Goidhw4g = dnnl_Goidhw4g, + Goidhw8g = dnnl_Goidhw8g, Goidhw16g = dnnl_Goidhw16g, + IOw8o8i = 
dnnl_IOw8o8i, IOw16o16i = dnnl_IOw16o16i, OIw16i16o = dnnl_OIw16i16o, OwI16i16o = dnnl_OwI16i16o, @@ -1743,6 +1796,7 @@ struct memory : public handle { Owi8o = dnnl_Owi8o, OwI8o2i = dnnl_OwI8o2i, OwI8o4i = dnnl_OwI8o4i, + IOhw8o8i = dnnl_IOhw8o8i, IOhw16o16i = dnnl_IOhw16o16i, Ohwi16o = dnnl_Ohwi16o, OhwI16o2i = dnnl_OhwI16o2i, @@ -1786,8 +1840,11 @@ struct memory : public handle { OhwI8i8o = dnnl_OhwI8i8o, OIhw8o16i2o = dnnl_OIhw8o16i2o, OIhw8o8i = dnnl_OIhw8o8i, + OIhw8o32i = dnnl_OIhw8o32i, + OIhw16o32i = dnnl_OIhw16o32i, OIhw8o4i = dnnl_OIhw8o4i, OIhw2i8o4i = dnnl_OIhw2i8o4i, + IOdhw8o8i = dnnl_IOdhw8o8i, IOdhw16o16i = dnnl_IOdhw16o16i, Odhwi16o = dnnl_Odhwi16o, OdhwI16o2i = dnnl_OdhwI16o2i, @@ -1841,6 +1898,7 @@ struct memory : public handle { OdhwI8i8o = dnnl_OdhwI8i8o, OIdhw8o8i = dnnl_OIdhw8o8i, OIdhw8o4i = dnnl_OIdhw8o4i, + gIOw8o8i = dnnl_gIOw8o8i, gIOw16o16i = dnnl_gIOw16o16i, gOIw16i16o = dnnl_gOIw16i16o, gOIw16o16i = dnnl_gOIw16o16i, @@ -1869,6 +1927,7 @@ struct memory : public handle { gOwI8o4i = dnnl_gOwI8o4i, Goiw8g = dnnl_Goiw8g, Goiw16g = dnnl_Goiw16g, + gIOhw8o8i = dnnl_gIOhw8o8i, gIOhw16o16i = dnnl_gIOhw16o16i, gOhwi16o = dnnl_gOhwi16o, gOhwI16o2i = dnnl_gOhwI16o2i, @@ -1915,6 +1974,7 @@ struct memory : public handle { gOIhw8o8i = dnnl_gOIhw8o8i, gOIhw8o4i = dnnl_gOIhw8o4i, gIOdhw16i16o = dnnl_gIOdhw16i16o, + gIOdhw8o8i = dnnl_gIOdhw8o8i, gIOdhw16o16i = dnnl_gIOdhw16o16i, gOdhwi16o = dnnl_gOdhwi16o, gOdhwI16o2i = dnnl_gOdhwI16o2i, @@ -1955,8 +2015,10 @@ struct memory : public handle { ldOi16o = abDc16d, ldOi32o = abDc32d, + ldOI16o4i = abDC16d4c, ldOI32o4i = abDC32d4c, ldgOi16o = abdEc16e, + ldgOI16o4i = abdEC16e4c, ldgOi32o = abdEc32e, ldgOI32o2i = abdEC32e2c, ldgOI32o4i = abdEC32e4c, @@ -2721,7 +2783,6 @@ struct memory : public handle { /// A memory descriptor. struct desc : public handle { using handle::handle; - friend struct memory; /// Constructs a zero (empty) memory descriptor. Such a memory @@ -2828,6 +2889,38 @@ struct memory : public handle { return desc {md}; } + /// Function for creating a memory descriptor for COO sparse encodings. + /// + /// The created memory descriptor will describe a memory object that + /// contains n+1 buffers for an n-dimensional tensor. + /// The buffers have the following meaning and assigned numbers (index): + /// - 0: values + /// - 1: indices for dimension 0 + /// - 2: indices for dimension 1 ... + /// - n: indices for dimension n-1 + /// + /// @param adims Tensor dimensions. + /// @param adata_type Data precision/type. + /// @param nnz Number of non-zero entries. + /// @param index_dt Data type of indices. + /// @param allow_empty A flag signifying whether construction is + /// allowed to fail without throwing an exception. In this case a + /// zero memory descriptor will be constructed. This flag is + /// optional and defaults to false. + static desc coo(const dims &adims, data_type adata_type, dim nnz, + data_type index_dt, bool allow_empty = false) { + validate_dims(adims); + dnnl_memory_desc_t md = nullptr; + dnnl_status_t status = dnnl_memory_desc_create_with_coo_encoding( + &md, (int)adims.size(), adims.data(), + convert_to_c(adata_type), nnz, convert_to_c(index_dt)); + if (!allow_empty) + error::wrap_c_api(status, + "could not create a memory descriptor for COO sparse " + "encoding"); + return desc {md}; + } + /// Function for creating a memory descriptor for packed sparse /// encoding. 
/// @@ -2880,6 +2973,31 @@ struct memory : public handle { reset(md); } + /// @fork + /// Copy constructor for memory::desc. + /// Ensures a deep copy (the underlying C structure is copied as well) to + /// preserve the behavior of oneDNN 2.x versions. + /// + /// @param adesc Memory descriptor to copy. + desc(const memory::desc& adesc) { + auto cdesc = adesc.get(); + dnnl_memory_desc_t cloned_md = nullptr; + dnnl_memory_desc_clone(&cloned_md, cdesc); + + reset(cloned_md); + } + + /// @fork + /// Creates a memory descriptor with packed sparse encoding. + desc sparse_desc(const dims &adims, data_type adata_type, + bool allow_empty = false) { + dnnl_memory_desc_t md = nullptr; + dnnl_status_t status = dnnl_memory_desc_create_sparse(&md, dnnl_sparse_encoding_packed, + (int)adims.size(), adims.data(), convert_to_c(adata_type)); + + if (!allow_empty) + error::wrap_c_api(status, + "could not construct a memory descriptor with sparse format"); + return desc(md); + } /// Constructs a memory descriptor for a region inside an area /// described by this memory descriptor. // @@ -3128,9 +3246,9 @@ struct memory : public handle { /// Returns the data type of the memory descriptor. /// /// @returns The data type. - memory::data_type get_data_type() const { - return query_data_type(query::data_type); - } + // memory::data_type get_data_type() const { + // return query_data_type(query::data_type); + // } #endif /// Returns the format kind of the memory descriptor. @@ -3145,6 +3263,30 @@ ... : dnnl::memory::format_kind::undef; } + /// Returns the sparse encoding of the memory descriptor. + /// + /// @returns The sparse encoding. + dnnl_sparse_encoding_t get_sparse_encoding() const { + dnnl_sparse_encoding_t sparse_encoding; + dnnl_status_t status = dnnl_memory_desc_query( + get(), dnnl_query_sparse_encoding, &sparse_encoding); + return status == dnnl_success + ? sparse_encoding + : dnnl_sparse_encoding_undef; + } + + /// Returns the data type of the memory descriptor. + /// + /// @returns The data type. + memory::data_type get_data_type() const { + dnnl_data_type_t data_type; + dnnl_status_t status = dnnl_memory_desc_query( + get(), dnnl_query_data_type, &data_type); + return status == dnnl_success + ? static_cast<memory::data_type>(data_type) + : dnnl::memory::data_type::undef; + } + /// Returns dimensions of the memory descriptor. /// /// Potentially expensive due to the data copy involved. @@ -3322,6 +3464,44 @@ ... reset(result); } #else + /// Constructs a memory object. + /// + /// Unless @p handle is equal to #DNNL_MEMORY_NONE, the constructed memory + /// object will have the underlying buffer set. In this case, the buffer + /// will be initialized as if #dnnl::memory::set_data_handle() had been + /// called. + /// + /// @sa memory::set_data_handle() + /// + /// @param md Memory descriptor. + /// @param aengine Engine to store the data on. + /// @param handle Handle of the memory buffer to use. + /// - A pointer to the user-allocated buffer. In this case the library + /// doesn't own the buffer. + /// - The #DNNL_MEMORY_ALLOCATE special value. Instructs the library to + /// allocate the buffer for the memory object. In this case the + /// library owns the buffer. + /// - #DNNL_MEMORY_NONE to create dnnl::memory without an underlying + /// buffer. + // memory(const desc &md, const engine &aengine, void *handle) { + // dnnl_memory_t result; + // error::wrap_c_api( + // dnnl_memory_create(&result, md.get(), aengine.get(), handle), + // "could not create a memory object"); + // reset(result); + // } + + /// Constructs a memory object.
+ /// + /// The underlying buffer(s) for the memory will be allocated by the + /// library. + /// + /// @param md Memory descriptor. + /// @param aengine Engine to store the data on. + // memory(const desc &md, const engine &aengine) + // : memory(md, aengine, DNNL_MEMORY_ALLOCATE) {} +#endif + /// Constructs a memory object. /// /// Unless @p handle is equal to #DNNL_MEMORY_NONE, the constructed memory /// object will have the underlying buffer set. In this case, the buffer /// will be initialized as if #dnnl::memory::set_data_handle() had been /// called. /// /// @sa memory::set_data_handle() /// /// @param md Memory descriptor. /// @param aengine Engine to store the data on. /// @param handle Handle of the memory buffer to use. @@ -3349,15 +3529,11 @@ struct memory : public handle { reset(result); } - /// Constructs a memory object. - /// /// The underlying buffer for the memory will be allocated by the library. - /// /// @param md Memory descriptor. /// @param aengine Engine to store the data on. memory(const desc &md, const engine &aengine) : memory(md, aengine, DNNL_MEMORY_ALLOCATE) {} -#endif /// Returns the associated memory descriptor. desc get_desc() const { @@ -3805,6 +3981,12 @@ struct post_ops : public handle { "could not append a binary post-op"); } + void append_dw_conv(int in_h, int in_w, int ker_h, int ker_w, int str_h, int str_w, dnnl_data_type_t in_dt) { + error::wrap_c_api(dnnl_post_ops_append_dw_conv(get(), + in_h, in_w, ker_h, ker_w, str_h, str_w, in_dt), + "could not append a dw convolution post-op"); + } + /// Returns the parameters of a binary post-op. /// /// @param index Index of the binary post-op. @@ -3889,6 +4071,23 @@ struct post_ops : public handle { error::wrap_c_api(dnnl_post_ops_get_params_prelu(get(), index, &mask), "could not get parameters of a binary post-op"); } + + void append_depthwise(algorithm alg, const std::array<size_t, 3>& offset) { + error::wrap_c_api(dnnl_post_ops_append_depthwise(get(), convert_to_c(alg), offset.size(), offset.data()), + "could not append a depthwise post-op"); + } + + void append_quantization(algorithm alg, const std::array<bool, 6>& per_channel, const std::array<bool, 6>& all_default, + const std::array<size_t, 6>& offset) { + error::wrap_c_api(dnnl_post_ops_append_quantization(get(), convert_to_c(alg), per_channel.size(), per_channel.data(), + all_default.size(), all_default.data(), offset.size(), offset.data()), + "could not append a quantization post-op"); + } + + void append_binarization(algorithm alg, const float* weights_data, const float* output_mask) { + error::wrap_c_api(dnnl_post_ops_append_binarization(get(), convert_to_c(alg), weights_data, output_mask), + "could not append a binarization post-op"); + } }; /// @cond DO_NOT_DOCUMENT_THIS @@ -4067,6 +4266,10 @@ struct primitive_attr : public handle { error::wrap_c_api(dnnl_primitive_attr_set_scales_mask(get(), arg, mask), "could not set scales primitive attribute"); } + void set_scales_dims(int arg, const memory::dims& dims, memory::data_type data_type = memory::data_type::f32) { + error::wrap_c_api(dnnl_primitive_attr_set_scales_dims(get(), arg, dims.data(), dims.size(), memory::convert_to_c(data_type)), + "could not set scales primitive attribute"); + } /// Sets scaling factors for primitive operations for a given memory /// argument. The scaling factors must be passed at execution time @@ -4112,6 +4315,11 @@ struct primitive_attr : public handle { dnnl_primitive_attr_set_zero_points_mask(get(), arg, mask), "could not set zero points primitive attribute"); } + void set_zero_points_dims(int arg, const memory::dims& dims, memory::data_type dt) { + error::wrap_c_api( + dnnl_primitive_attr_set_zero_points_dims(get(), arg, dims.data(), dims.size(), memory::convert_to_c(dt)), + "could not set zero points primitive attribute"); + } /// Sets zero points for primitive operations for a given memory argument.
/// The zero points must be passed at execution time as an argument with @@ -4139,10 +4347,28 @@ struct primitive_attr : public handle { "could not set zero points primitive attribute"); } + void set_output_compensations(dnnl_dim_t count, int mask) + { + error::wrap_c_api(dnnl_primitive_attr_set_output_compensations(get(), count, mask), + "could not set int output compensations"); + } + + void set_input_zero_points(dnnl_dim_t count, int mask) + { + error::wrap_c_api(dnnl_primitive_attr_set_input_zero_points(get(), count, mask), + "could not set int input zero_points"); + } + + void set_weights_zero_points(dnnl_dim_t count, int mask) + { + error::wrap_c_api(dnnl_primitive_attr_set_weights_zero_points(get(), count, mask), + "could not set int weights zero_points"); + } + /// Returns post-ops previously set via set_post_ops(). /// /// @returns Post-ops. - const post_ops get_post_ops() const { + post_ops get_post_ops() const { const_dnnl_post_ops_t const_c_post_ops; error::wrap_c_api( dnnl_primitive_attr_get_post_ops(get(), &const_c_post_ops), @@ -4161,7 +4387,7 @@ struct primitive_attr : public handle { /// by the respective primitive descriptor constructor. /// /// @param ops Post-ops object to copy post-ops from. - void set_post_ops(const post_ops ops) { + void set_post_ops(const post_ops &ops) { error::wrap_c_api(dnnl_primitive_attr_set_post_ops(get(), ops.get()), "could not set post-ops primitive attribute"); } @@ -4362,6 +4588,16 @@ struct primitive_attr : public handle { for (dnnl_dim_t c = 0; c < count; c++) scales[c] = c_scales[c]; } + + void set_src_dyn_quant_params(uint64_t group_size) { + error::wrap_c_api(dnnl_primitive_attr_set_src_dyn_quant_params(get(), group_size), + "could not set src dynamic quantization parameters primitive attribute"); + } + + void get_src_dyn_quant_params(uint64_t& group_size) const { + error::wrap_c_api(dnnl_primitive_attr_get_src_dyn_quant_params(get(), &group_size), + "could not get src dynamic quantization parameters primitive attribute"); + } }; /// @} dnnl_api_attributes @@ -5002,8 +5238,10 @@ struct reorder : public primitive { dst_engine.get(), attr.get()); if (!allow_empty) error::wrap_c_api(status, - "could not create a primitive descriptor for a reorder " - "primitive"); + "could not create a primitive descriptor for " + "the reorder primitive. Run workload with " + "environment variable ONEDNN_VERBOSE=all to get " + "additional diagnostic information."); reset(status == dnnl_success ? result : dnnl_primitive_desc_t()); } @@ -5030,8 +5268,10 @@ struct reorder : public primitive { dst.get_engine().get(), attr.get()); if (!allow_empty) error::wrap_c_api(status, - "could not create a primitive descriptor for a reorder " - "primitive"); + "could not create a primitive descriptor for " + "the reorder primitive. Run workload with " + "environment variable ONEDNN_VERBOSE=all to get " + "additional diagnostic information."); reset(status == dnnl_success ? result : dnnl_primitive_desc_t()); } @@ -5155,8 +5395,10 @@ struct concat : public primitive { concat_dimension, c_srcs.data(), attr.get()); if (!allow_empty) error::wrap_c_api(status, - "could not create a primitive descriptor for a concat " - "primitive"); + "could not create a primitive descriptor for " + "the concat primitive. Run workload with " + "environment variable ONEDNN_VERBOSE=all to get " + "additional diagnostic information."); reset(status == dnnl_success ? 
result : dnnl_primitive_desc_t()); } @@ -5189,8 +5431,10 @@ struct concat : public primitive { concat_dimension, c_api_srcs.data(), attr.get()); if (!allow_empty) error::wrap_c_api(status, - "could not create a primitive descriptor for a concat " - "primitive"); + "could not create a primitive descriptor for " + "the concat primitive. Run workload with " + "environment variable ONEDNN_VERBOSE=all to get " + "additional diagnostic information."); reset(status == dnnl_success ? result : dnnl_primitive_desc_t()); } @@ -5271,8 +5515,10 @@ struct sum : public primitive { scales.data(), c_api_srcs.data(), attr.get()); if (!allow_empty) error::wrap_c_api(status, - "could not create a primitive descriptor for a sum " - "primitive"); + "could not create a primitive descriptor for " + "the sum primitive. Run workload with " + "environment variable ONEDNN_VERBOSE=all to get " + "additional diagnostic information."); reset(status == dnnl_success ? result : dnnl_primitive_desc_t()); } @@ -5306,8 +5552,10 @@ struct sum : public primitive { scales.data(), c_api_srcs.data(), attr.get()); if (!allow_empty) error::wrap_c_api(status, - "could not create a primitive descriptor for a sum " - "primitive"); + "could not create a primitive descriptor for " + "the sum primitive. Run workload with " + "environment variable ONEDNN_VERBOSE=all to get " + "additional diagnostic information."); reset(status == dnnl_success ? result : dnnl_primitive_desc_t()); } @@ -5647,8 +5895,10 @@ struct convolution_forward : public primitive { &padding_l[0], &padding_r[0], attr.get()); if (!allow_empty) error::wrap_c_api(status, - "could not create a primitive descriptor for a " - "convolution forward propagation primitive"); + "could not create a primitive descriptor for " + "the convolution forward propagation primitive. Run " + "workload with environment variable ONEDNN_VERBOSE=all " + "to get additional diagnostic information."); reset(pd); } }; @@ -5839,8 +6089,10 @@ struct convolution_backward_data : public primitive { hint_fwd_pd.get(), attr.get()); if (!allow_empty) error::wrap_c_api(status, - "could not create a primitive descriptor for a " - "convolution backward propagation primitive"); + "could not create a primitive descriptor for " + "the convolution backward propagation primitive. Run " + "workload with environment variable ONEDNN_VERBOSE=all " + "to get additional diagnostic information."); reset(pd); } }; @@ -6145,8 +6397,10 @@ struct convolution_backward_weights : public primitive { &padding_r[0], hint_fwd_pd.get(), attr.get()); if (!allow_empty) error::wrap_c_api(status, - "could not create a primitive descriptor for a " - "convolution weights update primitive"); + "could not create a primitive descriptor for " + "the convolution weights update primitive. Run " + "workload with environment variable ONEDNN_VERBOSE=all " + "to get additional diagnostic information."); reset(pd); } }; @@ -6441,8 +6695,10 @@ struct deconvolution_forward : public primitive { &padding_l[0], &padding_r[0], attr.get()); if (!allow_empty) error::wrap_c_api(status, - "could not create a primitive descriptor for a " - "deconvolution forward propagation primitive"); + "could not create a primitive descriptor for " + "the deconvolution forward propagation primitive. 
Run " + "workload with environment variable ONEDNN_VERBOSE=all " + "to get additional diagnostic information."); reset(pd); } }; @@ -6631,8 +6887,10 @@ struct deconvolution_backward_data : public primitive { hint_fwd_pd.get(), attr.get()); if (!allow_empty) error::wrap_c_api(status, - "could not create a primitive descriptor for a " - "deconvolution backward propagation primitive"); + "could not create a primitive descriptor for " + "the deconvolution backward propagation primitive. Run " + "workload with environment variable ONEDNN_VERBOSE=all " + "to get additional diagnostic information."); reset(pd); } }; @@ -6930,8 +7188,10 @@ struct deconvolution_backward_weights : public primitive { &padding_r[0], hint_fwd_pd.get(), attr.get()); if (!allow_empty) error::wrap_c_api(status, - "could not create a primitive descriptor for a " - "deconvolution weights update primitive"); + "could not create a primitive descriptor for " + "the deconvolution weights update primitive. Run " + "workload with environment variable ONEDNN_VERBOSE=all " + "to get additional diagnostic information."); reset(pd); } }; @@ -7009,8 +7269,10 @@ struct lrn_forward : public primitive { if (!allow_empty) error::wrap_c_api(status, - "could not create a primitive descriptor for a lrn " - "forward propagation primitive"); + "could not create a primitive descriptor for " + "the lrn forward propagation primitive. Run workload " + "with environment variable ONEDNN_VERBOSE=all to get " + "additional diagnostic information."); reset(pd); } @@ -7116,8 +7378,10 @@ struct lrn_backward : public primitive { if (!allow_empty) error::wrap_c_api(status, - "could not create a primitive descriptor for a lrn " - "backward propagation primitive"); + "could not create a primitive descriptor for " + "the lrn backward propagation primitive. Run workload " + "with environment variable ONEDNN_VERBOSE=all to get " + "additional diagnostic information."); reset(pd); } @@ -7329,8 +7593,10 @@ struct eltwise_forward : public primitive { if (!allow_empty) error::wrap_c_api(status, - "could not create a primitive descriptor for an " - "eltwise forward propagation primitive"); + "could not create a primitive descriptor for " + "the eltwise forward propagation primitive. Run " + "workload with environment variable ONEDNN_VERBOSE=all " + "to get additional diagnostic information."); reset(pd); } }; @@ -7506,8 +7772,10 @@ struct eltwise_backward : public primitive { if (!allow_empty) error::wrap_c_api(status, - "could not create a primitive descriptor for an " - "eltwise backward propagation primitive"); + "could not create a primitive descriptor for " + "the eltwise backward propagation primitive. Run " + "workload with environment variable ONEDNN_VERBOSE=all " + "to get additional diagnostic information."); reset(pd); } }; @@ -7579,8 +7847,10 @@ struct softmax_forward : public primitive { if (!allow_empty) error::wrap_c_api(status, - "could not create a primitive descriptor for a softmax " - "forward propagation primitive"); + "could not create a primitive descriptor for " + "the softmax forward propagation primitive. Run " + "workload with environment variable ONEDNN_VERBOSE=all " + "to get additional diagnostic information."); reset(pd); } @@ -7670,8 +7940,10 @@ struct softmax_backward : public primitive { if (!allow_empty) error::wrap_c_api(status, - "could not create a primitive descriptor for a softmax " - "backward propagation primitive"); + "could not create a primitive descriptor for " + "the softmax backward propagation primitive. 
Run " + "workload with environment variable ONEDNN_VERBOSE=all " + "to get additional diagnostic information."); reset(pd); } @@ -7788,8 +8060,11 @@ struct batch_normalization_forward : public primitive { if (!allow_empty) error::wrap_c_api(status, - "could not create a primitive descriptor for a batch " - "normalization forward propagation primitive"); + "could not create a primitive descriptor for " + "the batch normalization forward propagation " + "primitive. Run workload with environment variable " + "ONEDNN_VERBOSE=all to get additional diagnostic " + "information."); reset(pd); } @@ -7916,8 +8191,11 @@ struct batch_normalization_backward : public primitive { if (!allow_empty) error::wrap_c_api(status, - "could not create a primitive descriptor for a batch " - "normalization backward propagation primitive"); + "could not create a primitive descriptor for " + "the batch normalization backward propagation " + "primitive. Run workload with environment variable " + "ONEDNN_VERBOSE=all to get additional diagnostic " + "information."); reset(pd); } @@ -8061,8 +8339,11 @@ struct group_normalization_forward : public primitive { if (!allow_empty) error::wrap_c_api(status, - "could not create a primitive descriptor for a group " - "normalization forward propagation primitive"); + "could not create a primitive descriptor for " + "the group normalization forward propagation " + "primitive. Run workload with environment variable " + "ONEDNN_VERBOSE=all to get additional diagnostic " + "information."); reset(pd); } @@ -8193,8 +8474,11 @@ struct group_normalization_backward : public primitive { if (!allow_empty) error::wrap_c_api(status, - "could not create a primitive descriptor for a group " - "normalization backward propagation primitive"); + "could not create a primitive descriptor for " + "the group normalization backward propagation " + "primitive. Run workload with environment variable " + "ONEDNN_VERBOSE=all to get additional diagnostic " + "information."); reset(pd); } @@ -8499,8 +8783,11 @@ struct layer_normalization_forward : public primitive { if (!allow_empty) error::wrap_c_api(status, - "could not create a primitive descriptor for a layer " - "normalization forward propagation primitive"); + "could not create a primitive descriptor for " + "the layer normalization forward propagation " + "primitive. Run workload with environment variable " + "ONEDNN_VERBOSE=all to get additional diagnostic " + "information."); reset(pd); } }; @@ -8768,8 +9055,11 @@ struct layer_normalization_backward : public primitive { if (!allow_empty) error::wrap_c_api(status, - "could not create a primitive descriptor for a layer " - "normalization backward propagation primitive"); + "could not create a primitive descriptor for " + "the layer normalization backward propagation " + "primitive. Run workload with environment variable " + "ONEDNN_VERBOSE=all to get additional diagnostic " + "information."); reset(pd); } }; @@ -8908,8 +9198,10 @@ struct inner_product_forward : public primitive { if (!allow_empty) error::wrap_c_api(status, - "could not create a primitive descriptor for an inner " - "product forward propagation primitive"); + "could not create a primitive descriptor for " + "the inner product forward propagation primitive. 
Run " + "workload with environment variable ONEDNN_VERBOSE=all " + "to get additional diagnostic information."); reset(pd); } }; @@ -8975,8 +9267,10 @@ struct inner_product_backward_data : public primitive { if (!allow_empty) error::wrap_c_api(status, - "could not create a primitive descriptor for an inner " - "product backward propagation primitive"); + "could not create a primitive descriptor for " + "the inner product backward propagation primitive. Run " + "workload with environment variable ONEDNN_VERBOSE=all " + "to get additional diagnostic information."); reset(pd); } @@ -9136,8 +9430,10 @@ struct inner_product_backward_weights : public primitive { if (!allow_empty) error::wrap_c_api(status, - "could not create a primitive descriptor for an inner " - "product weights gradient primitive"); + "could not create a primitive descriptor for " + "the inner product weights gradient primitive. Run " + "workload with environment variable ONEDNN_VERBOSE=all " + "to get additional diagnostic information."); reset(pd); } }; @@ -9438,8 +9734,10 @@ struct rnn_primitive_desc_base : public primitive_desc { weights_iter_desc.get(), bias_desc.get(), dst_layer_desc.get(), dst_iter_desc.get(), convert_to_c(flags), alpha, beta, attr.get()); - msg = "could not create a primitive descriptor for a vanilla " - "RNN forward propagation primitive"; + msg = "could not create a primitive descriptor for " + "the vanilla RNN forward propagation primitive. Run " + "workload with environment variable ONEDNN_VERBOSE=all " + "to get additional diagnostic information."; break; case algorithm::vanilla_lstm: status = dnnl_lstm_forward_primitive_desc_create(&pd, @@ -9452,8 +9750,10 @@ struct rnn_primitive_desc_base : public primitive_desc { dst_layer_desc.get(), dst_iter_desc.get(), optional_arg(dst_iter_c_desc), convert_to_c(flags), attr.get()); - msg = "could not create a primitive descriptor for an LSTM " - "forward propagation primitive"; + msg = "could not create a primitive descriptor for " + "the LSTM forward propagation primitive. Run workload " + "with environment variable ONEDNN_VERBOSE=all to get " + "additional diagnostic information."; break; case algorithm::vanilla_gru: status = dnnl_gru_forward_primitive_desc_create(&pd, @@ -9463,8 +9763,10 @@ struct rnn_primitive_desc_base : public primitive_desc { weights_iter_desc.get(), bias_desc.get(), dst_layer_desc.get(), dst_iter_desc.get(), convert_to_c(flags), attr.get()); - msg = "could not create a primitive descriptor for a GRU " - "forward propagation primitive"; + msg = "could not create a primitive descriptor for " + "the GRU forward propagation primitive. Run workload " + "with environment variable ONEDNN_VERBOSE=all to get " + "additional diagnostic information."; break; case algorithm::lbr_gru: status = dnnl_lbr_gru_forward_primitive_desc_create(&pd, @@ -9474,8 +9776,10 @@ struct rnn_primitive_desc_base : public primitive_desc { weights_iter_desc.get(), bias_desc.get(), dst_layer_desc.get(), dst_iter_desc.get(), convert_to_c(flags), attr.get()); - msg = "could not create a primitive descriptor for an LBR GRU " - "forward propagation primitive"; + msg = "could not create a primitive descriptor for " + "the LBR GRU forward propagation primitive. 
Run workload " + "with environment variable ONEDNN_VERBOSE=all to get " + "additional diagnostic information."; break; case algorithm::vanilla_augru: status = dnnl_augru_forward_primitive_desc_create(&pd, @@ -9485,8 +9789,10 @@ struct rnn_primitive_desc_base : public primitive_desc { weights_layer_desc.get(), weights_iter_desc.get(), bias_desc.get(), dst_layer_desc.get(), dst_iter_desc.get(), convert_to_c(flags), attr.get()); - msg = "could not create a primitive descriptor for an AUGRU " - "forward propagation primitive"; + msg = "could not create a primitive descriptor for " + "the AUGRU forward propagation primitive. Run workload " + "with environment variable ONEDNN_VERBOSE=all to get " + "additional diagnostic information."; break; case algorithm::lbr_augru: status = dnnl_lbr_augru_forward_primitive_desc_create(&pd, @@ -9496,8 +9802,10 @@ struct rnn_primitive_desc_base : public primitive_desc { weights_layer_desc.get(), weights_iter_desc.get(), bias_desc.get(), dst_layer_desc.get(), dst_iter_desc.get(), convert_to_c(flags), attr.get()); - msg = "could not create a primitive descriptor for an LBR " - "AUGRU forward propagation primitive"; + msg = "could not create a primitive descriptor for " + "the LBR AUGRU forward propagation primitive. Run " + "workload with environment variable ONEDNN_VERBOSE=all " + "to get additional diagnostic information."; break; default: status = dnnl_unimplemented; } @@ -9555,8 +9863,10 @@ struct rnn_primitive_desc_base : public primitive_desc { diff_dst_layer_desc.get(), diff_dst_iter_desc.get(), convert_to_c(flags), alpha, beta, hint_fwd_pd.get(), attr.get()); - msg = "could not create a primitive descriptor for a vanilla " - "RNN backward propagation primitive"; + msg = "could not create a primitive descriptor for " + "the vanilla RNN backward propagation primitive. Run " + "workload with environment variable ONEDNN_VERBOSE=all " + "to get additional diagnostic information."; break; case algorithm::vanilla_lstm: status = dnnl_lstm_backward_primitive_desc_create(&pd, @@ -9578,8 +9888,10 @@ struct rnn_primitive_desc_base : public primitive_desc { diff_dst_iter_desc.get(), optional_arg(diff_dst_iter_c_desc), convert_to_c(flags), hint_fwd_pd.get(), attr.get()); - msg = "could not create a primitive descriptor for an LSTM " - "backward propagation primitive"; + msg = "could not create a primitive descriptor for " + "the LSTM backward propagation primitive. Run workload " + "with environment variable ONEDNN_VERBOSE=all to get " + "additional diagnostic information."; break; case algorithm::vanilla_gru: status = dnnl_gru_backward_primitive_desc_create(&pd, @@ -9593,8 +9905,10 @@ struct rnn_primitive_desc_base : public primitive_desc { diff_weights_iter_desc.get(), diff_bias_desc.get(), diff_dst_layer_desc.get(), diff_dst_iter_desc.get(), convert_to_c(flags), hint_fwd_pd.get(), attr.get()); - msg = "could not create a primitive descriptor for a GRU " - "backward propagation primitive"; + msg = "could not create a primitive descriptor for " + "the GRU backward propagation primitive. 
Run workload " + "with environment variable ONEDNN_VERBOSE=all to get " + "additional diagnostic information."; break; case algorithm::lbr_gru: status = dnnl_lbr_gru_backward_primitive_desc_create(&pd, @@ -9608,8 +9922,10 @@ struct rnn_primitive_desc_base : public primitive_desc { diff_weights_iter_desc.get(), diff_bias_desc.get(), diff_dst_layer_desc.get(), diff_dst_iter_desc.get(), convert_to_c(flags), hint_fwd_pd.get(), attr.get()); - msg = "could not create a primitive descriptor for an LBR GRU " - "backward propagation primitive"; + msg = "could not create a primitive descriptor for " + "the LBR GRU backward propagation primitive. Run " + "workload with environment variable ONEDNN_VERBOSE=all " + "to get additional diagnostic information."; break; case algorithm::vanilla_augru: status = dnnl_augru_backward_primitive_desc_create(&pd, @@ -9625,8 +9941,10 @@ struct rnn_primitive_desc_base : public primitive_desc { diff_weights_iter_desc.get(), diff_bias_desc.get(), diff_dst_layer_desc.get(), diff_dst_iter_desc.get(), convert_to_c(flags), hint_fwd_pd.get(), attr.get()); - msg = "could not create a primitive descriptor for an AUGRU " - "backward propagation primitive"; + msg = "could not create a primitive descriptor for " + "the AUGRU backward propagation primitive. Run workload " + "with environment variable ONEDNN_VERBOSE=all to get " + "additional diagnostic information."; break; case algorithm::lbr_augru: status = dnnl_lbr_augru_backward_primitive_desc_create(&pd, @@ -9642,8 +9960,10 @@ struct rnn_primitive_desc_base : public primitive_desc { diff_weights_iter_desc.get(), diff_bias_desc.get(), diff_dst_layer_desc.get(), diff_dst_iter_desc.get(), convert_to_c(flags), hint_fwd_pd.get(), attr.get()); - msg = "could not create a primitive descriptor for an LBR " - "AUGRU backward propagation primitive"; + msg = "could not create a primitive descriptor for " + "the LBR AUGRU backward propagation primitive. Run " + "workload with environment variable ONEDNN_VERBOSE=all " + "to get additional diagnostic information."; break; default: status = dnnl_unimplemented; } @@ -12381,8 +12701,10 @@ struct shuffle_forward : public primitive { if (!allow_empty) error::wrap_c_api(status, - "could not create a primitive descriptor for a shuffle " - "forward propagation primitive"); + "could not create a primitive descriptor for " + "the shuffle forward propagation primitive. Run " + "workload with environment variable ONEDNN_VERBOSE=all " + "to get additional diagnostic information."); reset(pd); } @@ -12468,8 +12790,10 @@ struct shuffle_backward : public primitive { if (!allow_empty) error::wrap_c_api(status, - "could not create a primitive descriptor for a shuffle " - "backward propagation primitive"); + "could not create a primitive descriptor for " + "the shuffle backward propagation primitive. Run " + "workload with environment variable ONEDNN_VERBOSE=all " + "to get additional diagnostic information."); reset(pd); } @@ -12560,8 +12884,46 @@ struct binary : public primitive { if (!allow_empty) error::wrap_c_api(status, - "could not create a primitive descriptor for a binary " - "operation primitive"); + "could not create a primitive descriptor for " + "the binary operation primitive. Run workload with " + "environment variable ONEDNN_VERBOSE=all to get " + "additional diagnostic information."); + reset(pd); + } + + /// Constructs a primitive descriptor for an elementwise binary operator + /// primitive with support of ternary operators. + /// + /// @param aengine Engine to use. 
+ /// @param aalgorithm Elementwise binary algorithm. + /// @param src0 Memory descriptor for source tensor #0. + /// @param src1 Memory descriptor for source tensor #1. + /// @param src2 Memory descriptor for source tensor #2 for ternary + /// operations. Might be empty. + /// @param dst Memory descriptor for destination tensor. + /// @param attr Primitive attributes to use. Attributes are optional + /// and default to empty attributes. + /// @param allow_empty A flag signifying whether construction is + /// allowed to fail without throwing an exception. In this case an + /// empty object will be produced. This flag is optional and + /// defaults to false. + primitive_desc(const engine &aengine, algorithm aalgorithm, + const memory::desc &src0, const memory::desc &src1, + const memory::desc &src2, const memory::desc &dst, + const primitive_attr &attr = default_attr(), + bool allow_empty = false) { + + dnnl_primitive_desc_t pd = nullptr; + dnnl_status_t status = dnnl_binary_primitive_desc_create_v2(&pd, + aengine.get(), dnnl::convert_to_c(aalgorithm), src0.get(), + src1.get(), src2.get(), dst.get(), attr.get()); + + if (!allow_empty) + error::wrap_c_api(status, + "could not create a primitive descriptor for " + "the binary v2 operation primitive. Run workload with " + "environment variable ONEDNN_VERBOSE=all to get " + "additional diagnostic information."); reset(pd); } @@ -12581,6 +12943,9 @@ struct binary : public primitive { /// Returns the memory descriptor for source #1. memory::desc src1_desc() const { return base::src_desc(1); } + /// Returns the memory descriptor for source #2. + memory::desc src2_desc() const { return base::src_desc(2); } + /// @copydoc dnnl::primitive_desc_base::dst_desc()const memory::desc dst_desc() const { return base::dst_desc(0); } @@ -12700,8 +13065,10 @@ struct matmul : public primitive { if (!allow_empty) error::wrap_c_api(status, - "could not create a primitive descriptor for a matmul " - "primitive"); + "could not create a primitive descriptor for " + "the matmul primitive. Run workload with " + "environment variable ONEDNN_VERBOSE=all to get " + "additional diagnostic information."); reset(pd); } }; @@ -12863,8 +13230,10 @@ struct resampling_forward : public primitive { if (!allow_empty) error::wrap_c_api(status, - "could not create a primitive descriptor for a " - "resampling forward propagation primitive"); + "could not create a primitive descriptor for " + "the resampling forward propagation primitive. Run " + "workload with environment variable ONEDNN_VERBOSE=all " + "to get additional diagnostic information."); reset(pd); } }; @@ -12987,8 +13356,10 @@ struct resampling_backward : public primitive { if (!allow_empty) error::wrap_c_api(status, - "could not create a primitive descriptor for a " - "resampling backward propagation primitive"); + "could not create a primitive descriptor for " + "the resampling backward propagation primitive. Run " + "workload with environment variable ONEDNN_VERBOSE=all " + "to get additional diagnostic information."); reset(pd); } }; @@ -13324,8 +13695,10 @@ struct prelu_forward : public primitive { if (!allow_empty) error::wrap_c_api(status, - "could not create a primitive descriptor for a prelu " - "forward propagation primitive"); + "could not create a primitive descriptor for " + "the prelu forward propagation primitive. 
Run workload " + "with environment variable ONEDNN_VERBOSE=all to get " + "additional diagnostic information."); reset(pd); } @@ -13409,8 +13782,10 @@ struct prelu_backward : public primitive { if (!allow_empty) error::wrap_c_api(status, - "could not create a primitive descriptor for a prelu " - "backward propagation primitive"); + "could not create a primitive descriptor for " + "the prelu backward propagation primitive. Run " + "workload with environment variable ONEDNN_VERBOSE=all " + "to get additional diagnostic information."); reset(pd); } @@ -13509,8 +13884,10 @@ struct reduction : public primitive { if (!allow_empty) error::wrap_c_api(status, - "could not create a primitive descriptor for a " - "reduction primitive descriptor"); + "could not create a primitive descriptor for " + "the reduction primitive. Run workload with " + "environment variable ONEDNN_VERBOSE=all to get " + "additional diagnostic information."); reset(pd); } @@ -13864,4 +14241,5 @@ namespace dnnl = ::dnnl; /// @} dnnl_api +// NOLINTEND(readability-identifier-naming) #endif /* ONEAPI_DNNL_DNNL_HPP */ diff --git a/include/oneapi/dnnl/dnnl_common.hpp b/include/oneapi/dnnl/dnnl_common.hpp index 562f2d4aaa3..1112f863c32 100644 --- a/include/oneapi/dnnl/dnnl_common.hpp +++ b/include/oneapi/dnnl/dnnl_common.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2022-2024 Intel Corporation +* Copyright 2022-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,6 +19,7 @@ #ifndef ONEAPI_DNNL_DNNL_COMMON_HPP #define ONEAPI_DNNL_DNNL_COMMON_HPP +// NOLINTBEGIN(readability-identifier-naming) /// @cond DO_NOT_DOCUMENT_THIS #include @@ -127,7 +128,7 @@ template <typename T, typename traits = handle_traits<T>> struct handle { private: static dnnl_status_t dummy_destructor(T) { return dnnl_success; } - std::shared_ptr<typename std::remove_pointer<T>::type> data_ {0}; + std::shared_ptr<typename std::remove_pointer<T>::type> data_ {nullptr}; protected: bool operator==(const T other) const { return other == data_.get(); } @@ -370,6 +371,7 @@ struct stream : public handle<dnnl_stream_t> { } }; +//NOLINTBEGIN(bugprone-macro-parentheses) #define DNNL_DEFINE_BITMASK_OPS(enum_name) \ inline enum_name operator|(enum_name lhs, enum_name rhs) { \ return static_cast<enum_name>( \ @@ -407,6 +409,7 @@ struct stream : public handle<dnnl_stream_t> { inline enum_name operator~(enum_name rhs) { \ return static_cast<enum_name>(~static_cast<uint64_t>(rhs)); \ } +//NOLINTEND(bugprone-macro-parentheses) DNNL_DEFINE_BITMASK_OPS(stream::flags) @@ -476,4 +479,5 @@ inline dnnl_accumulation_mode_t convert_to_c(accumulation_mode mode) { /// @} dnnl_api -#endif +// NOLINTEND(readability-identifier-naming) +#endif /* ONEAPI_DNNL_DNNL_COMMON_HPP */ diff --git a/include/oneapi/dnnl/dnnl_common_types.h b/include/oneapi/dnnl/dnnl_common_types.h index 5b6348ebae7..56ac2a8ecf3 100644 --- a/include/oneapi/dnnl/dnnl_common_types.h +++ b/include/oneapi/dnnl/dnnl_common_types.h @@ -104,6 +104,14 @@ typedef enum { dnnl_u4 = 12, /// [MX-compliant 8-bit compliant scale data type](https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf) with 8-bit exponent. dnnl_e8m0 = 13, + /// [MX-compliant 4-bit float data type](https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf) with 2-bit exponent and 1 bit mantissa. + dnnl_f4_e2m1 = 14, + /// 4-bit float data type with 3-bit exponent and 0 bit mantissa. + dnnl_f4_e3m0 = 15, + /// 4-bit normalized float. + dnnl_nf4 = 16, + /// 1-bit integer.
+ dnnl_bin = 17, /// Parameter to allow internal only data_types without undefined behavior. /// This parameter is chosen to be valid for so long as sizeof(int) >= 2. diff --git a/include/oneapi/dnnl/dnnl_config.h.in b/include/oneapi/dnnl/dnnl_config.h.in index f2ba61b6511..af74c13a072 100644 --- a/include/oneapi/dnnl/dnnl_config.h.in +++ b/include/oneapi/dnnl/dnnl_config.h.in @@ -70,6 +70,9 @@ /// TBB runtime (CPU only) #define DNNL_RUNTIME_TBB 4u +/// TBB runtime with auto partitioning (CPU only) +#define DNNL_RUNTIME_TBB_AUTO 5u + /// Threadpool runtime (CPU only) #define DNNL_RUNTIME_THREADPOOL 8u @@ -222,6 +225,7 @@ #cmakedefine01 BUILD_XEHPG #cmakedefine01 BUILD_XEHPC #cmakedefine01 BUILD_XE2 +#cmakedefine01 BUILD_XE3 // GeMM kernels ISA controls #cmakedefine01 BUILD_GEMM_KERNELS_ALL #cmakedefine01 BUILD_GEMM_KERNELS_NONE diff --git a/include/oneapi/dnnl/dnnl_debug.h b/include/oneapi/dnnl/dnnl_debug.h index 9efa63dd61e..14b7fb596e4 100644 --- a/include/oneapi/dnnl/dnnl_debug.h +++ b/include/oneapi/dnnl/dnnl_debug.h @@ -44,6 +44,7 @@ const char DNNL_API *dnnl_fmt_tag2str(dnnl_format_tag_t v); const char DNNL_API *dnnl_prop_kind2str(dnnl_prop_kind_t v); const char DNNL_API *dnnl_prim_kind2str(dnnl_primitive_kind_t v); const char DNNL_API *dnnl_alg_kind2str(dnnl_alg_kind_t v); +const char DNNL_API *dnnl_sparse_encoding2str(dnnl_sparse_encoding_t v); const char DNNL_API *dnnl_rnn_flags2str(dnnl_rnn_flags_t v); const char DNNL_API *dnnl_rnn_direction2str(dnnl_rnn_direction_t v); const char DNNL_API *dnnl_scratchpad_mode2str(dnnl_scratchpad_mode_t v); diff --git a/include/oneapi/dnnl/dnnl_graph.h b/include/oneapi/dnnl/dnnl_graph.h index a0d465982ca..77f7b46b48f 100644 --- a/include/oneapi/dnnl/dnnl_graph.h +++ b/include/oneapi/dnnl/dnnl_graph.h @@ -590,6 +590,28 @@ dnnl_status_t DNNL_API dnnl_graph_graph_create_with_fpmath_mode( /// otherwise. dnnl_status_t DNNL_API dnnl_graph_graph_destroy(dnnl_graph_graph_t graph); +/// Set the floating point math mode for a graph. +/// +/// @param graph The target graph. +/// @param mode The floating-point math mode. +/// @param apply_to_int The flag that controls whether to use floating-point +/// arithmetic for integral operations. +/// @returns #dnnl_success on success or a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_graph_graph_set_fpmath_mode( + dnnl_graph_graph_t graph, dnnl_fpmath_mode_t mode, int apply_to_int); + +/// Get the floating point math mode for a graph. +/// +/// @param graph The target graph. +/// @param mode The floating-point math mode. +/// @param apply_to_int The flag that controls whether to use floating-point +/// arithmetic for integral operations. +/// @returns #dnnl_success on success or a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_graph_graph_get_fpmath_mode( + dnnl_graph_graph_t graph, dnnl_fpmath_mode_t *mode, int *apply_to_int); + /// Adds an operation into a graph. The API will return failure if the operator /// has already been added to the graph or the operation cannot pass the schema /// check in the library (eg. 
input and output numbers and data types, the diff --git a/include/oneapi/dnnl/dnnl_graph.hpp b/include/oneapi/dnnl/dnnl_graph.hpp index 1d178e07973..288105aa08b 100644 --- a/include/oneapi/dnnl/dnnl_graph.hpp +++ b/include/oneapi/dnnl/dnnl_graph.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2020-2024 Intel Corporation +* Copyright 2020-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,6 +19,7 @@ #ifndef ONEAPI_DNNL_DNNL_GRAPH_HPP #define ONEAPI_DNNL_DNNL_GRAPH_HPP +// NOLINTBEGIN(readability-identifier-naming) #include "oneapi/dnnl/dnnl_common.hpp" #include "oneapi/dnnl/dnnl_graph.h" @@ -270,6 +271,10 @@ class logical_tensor { /// floating-point](https://www.opencompute.org/documents/ocp-8-bit-floating-point-specification-ofp8-revision-1-0-2023-06-20-pdf) /// with a 4-bit exponent and a 3-bit mantissa. f8_e4m3 = dnnl_f8_e4m3, + /// 4-bit signed integer. + s4 = dnnl_s4, + /// 4-bit unsigned integer. + u4 = dnnl_u4, }; /// Layout type @@ -360,7 +365,7 @@ class logical_tensor { layout_type ltype, property_type ptype = property_type::undef) { dnnl_graph_logical_tensor_t val; // if dimension size equals to 0, it's a scalar - if (adims.size() == 0) + if (adims.empty()) error::wrap_c_api(dnnl_graph_logical_tensor_init(&val, tid, convert_to_c(dtype), 0, convert_to_c(ltype), convert_to_c(ptype)), @@ -415,7 +420,7 @@ class logical_tensor { property_type ptype = property_type::undef) { dnnl_graph_logical_tensor_t val; - if (adims.size() == 0) { + if (adims.empty()) { error::wrap_c_api(dnnl_graph_logical_tensor_init(&val, tid, convert_to_c(dtype), 0, convert_to_c(layout_type::opaque), @@ -832,6 +837,8 @@ class op : public op_handle { TanhBackward = dnnl_graph_op_tanh_backward, TypeCast = dnnl_graph_op_type_cast, Wildcard = dnnl_graph_op_wildcard, + GenIndex = dnnl_graph_op_gen_index, + GreaterEqual = dnnl_graph_op_greater_equal, // Sentinel LastSymbol = dnnl_graph_op_last_symbol, }; @@ -908,6 +915,12 @@ class op : public op_handle { weights_shape = dnnl_graph_op_attr_weights_shape, /// Specifies a zps attribute to an op. zps = dnnl_graph_op_attr_zps, + /// Specifies the group shape of an op. The size of the vector should + /// match that of the input. For the dimensions where the grouped + /// quantization occurs, the values should correspond to the group + /// size, which indicates the number of elements that will share the + /// same scaling factor. + group_shape = dnnl_graph_op_attr_group_shape, // bool attributes. The value of these attributes can be any single bool // value. @@ -1373,6 +1386,10 @@ class graph : public graph_handle { /// mode. All partitions returned from the graph will inherit the engine /// kind and floating-point math mode. /// + /// Setting the floating-point math mode enables automatic down-conversion + /// of inputs for the given graph, promoting speedup by using + /// lower-precision data types when available. + /// /// @param engine_kind Engine kind. /// @param mode Floating-point math mode. graph(engine::kind engine_kind, fpmath_mode mode) { @@ -1384,6 +1401,37 @@ class graph : public graph_handle { reset(g); } + /// Set the floating point math mode for a graph. Users can enforce the + /// graph to comply with the mode by specifying a boolean flag with the + /// setter function. + /// + /// @param mode The floating-point math mode. 
+    /// @param apply_to_int The flag that controls whether to use
+    /// floating-point arithmetic for integral operations.
+    void set_fpmath_mode(fpmath_mode mode, bool apply_to_int = false) {
+        error::wrap_c_api(dnnl_graph_graph_set_fpmath_mode(
+                                  get(), convert_to_c(mode), apply_to_int),
+                "could not set fpmath mode graph attribute");
+    }
+
+    /// Get the floating point math mode and the boolean flag that specifies
+    /// whether the graph will be enforced to comply with the mode.
+    ///
+    /// @param mode The floating-point math mode.
+    /// @param apply_to_int The flag that controls whether to use
+    /// floating-point arithmetic for integral operations.
+    void get_fpmath_mode(fpmath_mode &mode, bool &apply_to_int) const {
+        dnnl_fpmath_mode_t c_mode;
+        int c_apply_to_int;
+
+        error::wrap_c_api(dnnl_graph_graph_get_fpmath_mode(
+                                  get(), &c_mode, &c_apply_to_int),
+                "could not get fpmath mode graph attribute");
+
+        mode = fpmath_mode(c_mode);
+        apply_to_int = static_cast<bool>(c_apply_to_int);
+    }
+
     /// Adds an op into the graph to construct a computational DAG. The API will
     /// return failure if the operator has already been added to the graph or
     /// the operation cannot pass the schema check in the library (eg. input and
@@ -1584,4 +1632,5 @@ namespace dnnl = ::dnnl;
 
 /// @} dnnl_api
 
-#endif
+// NOLINTEND(readability-identifier-naming)
+#endif /* ONEAPI_DNNL_DNNL_GRAPH_HPP */
diff --git a/include/oneapi/dnnl/dnnl_graph_ocl.hpp b/include/oneapi/dnnl/dnnl_graph_ocl.hpp
index 636dc0d1c47..18ff36bd686 100644
--- a/include/oneapi/dnnl/dnnl_graph_ocl.hpp
+++ b/include/oneapi/dnnl/dnnl_graph_ocl.hpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2024 Intel Corporation
+* Copyright 2024-2025 Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
diff --git a/include/oneapi/dnnl/dnnl_graph_sycl.hpp b/include/oneapi/dnnl/dnnl_graph_sycl.hpp
index 8f694f4b36b..2507842cb38 100644
--- a/include/oneapi/dnnl/dnnl_graph_sycl.hpp
+++ b/include/oneapi/dnnl/dnnl_graph_sycl.hpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2020-2024 Intel Corporation
+* Copyright 2020-2025 Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -25,8 +25,6 @@
 
 #if __has_include(<sycl/sycl.hpp>)
 #include <sycl/sycl.hpp>
-#elif __has_include(<CL/sycl.hpp>)
-#include <CL/sycl.hpp>
 #else
 #error "Unsupported compiler"
 #endif
diff --git a/include/oneapi/dnnl/dnnl_graph_types.h b/include/oneapi/dnnl/dnnl_graph_types.h
index 4ec65da25cd..4aeb4d6bd87 100644
--- a/include/oneapi/dnnl/dnnl_graph_types.h
+++ b/include/oneapi/dnnl/dnnl_graph_types.h
@@ -1,5 +1,5 @@
 /*******************************************************************************
- * Copyright 2020-2024 Intel Corporation
+ * Copyright 2020-2025 Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -256,6 +256,8 @@ typedef enum {
     dnnl_graph_op_select,
     dnnl_graph_op_pow,
     dnnl_graph_op_group_norm,
+    dnnl_graph_op_gen_index,
+    dnnl_graph_op_greater_equal,
     dnnl_graph_op_last_symbol,
 } dnnl_graph_op_kind_t;
 
@@ -327,6 +329,8 @@ typedef enum {
     dnnl_graph_op_attr_weights_shape,
     /// Specifies a zps attribute to an op.
     dnnl_graph_op_attr_zps,
+    /// Specifies a group shape attribute to an op.
+    dnnl_graph_op_attr_group_shape,
 
     // bool attributes. The value of these attributes can be any single bool
     // value.
diff --git a/include/oneapi/dnnl/dnnl_ocl.h b/include/oneapi/dnnl/dnnl_ocl.h
index 6300bb7459f..70d0c5460a0 100644
--- a/include/oneapi/dnnl/dnnl_ocl.h
+++ b/include/oneapi/dnnl/dnnl_ocl.h
@@ -75,6 +75,35 @@ dnnl_status_t DNNL_API dnnl_ocl_interop_memory_create(dnnl_memory_t *memory,
         const_dnnl_memory_desc_t memory_desc, dnnl_engine_t engine,
         dnnl_ocl_interop_memory_kind_t memory_kind, void *handle);
 
+#ifdef DNNL_EXPERIMENTAL_SPARSE
+/// Creates a memory object with multiple handles.
+///
+/// @param memory Output memory object.
+/// @param memory_desc Memory descriptor.
+/// @param engine Engine to use.
+/// @param memory_kind Memory allocation kind to specify the type of handles.
+/// @param nhandles Number of handles.
+/// @param handles Handles of the memory buffers to use as underlying storages.
+///     For each element of the @p handles array the following applies:
+///     - A USM pointer to the user-allocated buffer. In this case the library
+///       doesn't own the buffer. Requires @p memory_kind to be equal to
+///       dnnl_ocl_interop_usm.
+///     - An OpenCL buffer. In this case the library doesn't own the buffer.
+///       Requires @p memory_kind to be equal to dnnl_ocl_interop_buffer.
+///     - The DNNL_MEMORY_ALLOCATE special value. Instructs the library to
+///       allocate the buffer that corresponds to the memory allocation kind
+///       @p memory_kind for the memory object. In this case the library
+///       owns the buffer.
+///     - The DNNL_MEMORY_NONE specific value. Instructs the library to
+///       create memory object without an underlying buffer.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_ocl_interop_memory_create_v2(dnnl_memory_t *memory,
+        const_dnnl_memory_desc_t memory_desc, dnnl_engine_t engine,
+        dnnl_ocl_interop_memory_kind_t memory_kind, int nhandles,
+        void **handles);
+#endif
+
 /// Returns the memory allocation kind associated with a memory object.
 ///
 /// @param memory Memory to query.
diff --git a/include/oneapi/dnnl/dnnl_ocl.hpp b/include/oneapi/dnnl/dnnl_ocl.hpp
index c2466bc8276..de3b4150b8a 100644
--- a/include/oneapi/dnnl/dnnl_ocl.hpp
+++ b/include/oneapi/dnnl/dnnl_ocl.hpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2020-2022 Intel Corporation
+* Copyright 2020-2025 Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -236,6 +236,112 @@ inline memory_kind get_memory_kind(const memory &amemory) {
     return static_cast<memory_kind>(ckind);
 }
 
+#ifdef DNNL_EXPERIMENTAL_SPARSE
+/// Creates a memory object with multiple handles.
+///
+/// @param memory_desc Memory descriptor.
+/// @param aengine Engine to use.
+/// @param kind Memory allocation kind to specify the type of handles.
+/// @param handles Handles of the memory buffers to use as underlying storages.
+///     For each element of the @p handles array the following applies:
+///     - A USM pointer to the user-allocated buffer. In this case the library
+///       doesn't own the buffer. Requires @p memory_kind to be equal to
+///       dnnl_ocl_interop_usm.
+///     - An OpenCL buffer. In this case the library doesn't own the buffer.
+///       Requires @p memory_kind to be equal to dnnl_ocl_interop_buffer.
+///     - The DNNL_MEMORY_ALLOCATE special value. Instructs the library to
+///       allocate the buffer that corresponds to the memory allocation kind
+///       @p memory_kind for the memory object. In this case the library
+///       owns the buffer.
+///     - The DNNL_MEMORY_NONE specific value. Instructs the library to
+///       create memory object without an underlying buffer.
+///
+/// If the @p handles vector is not provided the library will allocate all
+/// buffers as if all handles have the special value DNNL_MEMORY_ALLOCATE.
+///
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+inline memory make_memory(const memory::desc &memory_desc,
+        const engine &aengine, memory_kind kind,
+        std::vector<void *> handles = {}) {
+    if (handles.empty()) {
+        const int nhandles = memory_desc.get_num_handles();
+        handles.resize(nhandles, DNNL_MEMORY_ALLOCATE);
+    }
+
+    dnnl_memory_t c_memory;
+    error::wrap_c_api(
+            dnnl_ocl_interop_memory_create_v2(&c_memory, memory_desc.get(),
+                    aengine.get(), convert_to_c(kind), (int)handles.size(),
+                    handles.data()),
+            "could not create a memory");
+    return memory(c_memory);
+}
+
+/// Constructs a memory object with multiple OpenCL buffers.
+///
+/// @param memory_desc Memory descriptor.
+/// @param aengine Engine to use.
+/// @param mem_objects A vector of OpenCL buffers to use.
+///
+/// @returns Created memory object.
+inline memory make_memory(const memory::desc &memory_desc,
+        const engine &aengine, std::vector<cl_mem> mem_objects) {
+    const int nhandles = memory_desc.get_num_handles();
+    std::vector<void *> handles(nhandles, DNNL_MEMORY_NONE);
+    memory amemory(memory_desc, aengine, handles);
+    for (int i = 0; i < nhandles; i++)
+        amemory.set_data_handle(mem_objects[i], i);
+    return amemory;
+}
+
+/// Creates a memory object.
+///
+/// Unless @p handle is equal to DNNL_MEMORY_NONE or DNNL_MEMORY_ALLOCATE, the
+/// constructed memory object will have the underlying buffer set. In this
+/// case, the buffer will be initialized as if:
+/// - dnnl::memory::set_data_handle() had been called, if @p memory_kind is
+///   equal to dnnl::ocl_interop::memory_kind::usm, or
+/// - dnnl::ocl_interop::set_mem_object() has been called, if @p memory_kind is
+///   equal to dnnl::ocl_interop::memory_kind::buffer.
+///
+/// @param memory_desc Memory descriptor.
+/// @param aengine Engine to use.
+/// @param kind Memory allocation kind to specify the type of handle.
+/// @param handle Handle of the memory buffer to use as an underlying storage.
+///     - A USM pointer to the user-allocated buffer. In this case the library
+///       doesn't own the buffer. Requires @p memory_kind to be equal to
+///       dnnl::ocl_interop::memory_kind::usm.
+///     - An OpenCL buffer. In this case the library doesn't own the buffer.
+///       Requires @p memory_kind to be equal to
+///       dnnl::ocl_interop::memory_kind::buffer.
+///     - The DNNL_MEMORY_ALLOCATE special value. Instructs the library to
+///       allocate the buffer that corresponds to the memory allocation kind
+///       @p memory_kind for the memory object. In this case the library
+///       owns the buffer.
+///     - The DNNL_MEMORY_NONE specific value. Instructs the library to
+///       create memory object without an underlying buffer.
+///
+/// @returns Created memory object.
+inline memory make_memory(const memory::desc &memory_desc,
+        const engine &aengine, memory_kind kind, void *handle) {
+    return make_memory(
+            memory_desc, aengine, kind, std::vector<void *> {handle});
+}
+
+/// Constructs a memory object from an OpenCL buffer.
+///
+/// @param memory_desc Memory descriptor.
+/// @param aengine Engine to use.
+/// @param mem_object An OpenCL buffer to use.
+///
+/// @returns Created memory object.
+inline memory make_memory(const memory::desc &memory_desc,
+        const engine &aengine, cl_mem mem_object) {
+    return make_memory(memory_desc, aengine, std::vector<cl_mem> {mem_object});
+}
+#else
+
 /// Creates a memory object.
 ///
 /// Unless @p handle is equal to DNNL_MEMORY_NONE or DNNL_MEMORY_ALLOCATE, the
@@ -288,6 +394,7 @@ inline memory make_memory(const memory::desc &memory_desc,
     set_mem_object(amemory, mem_object);
     return amemory;
 }
+#endif
 
 /// Executes computations specified by the primitive in a specified stream and
 /// returns a SYCL event.
diff --git a/include/oneapi/dnnl/dnnl_sycl.h b/include/oneapi/dnnl/dnnl_sycl.h
index ed61d92435b..a4abe851836 100644
--- a/include/oneapi/dnnl/dnnl_sycl.h
+++ b/include/oneapi/dnnl/dnnl_sycl.h
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2020-2022 Intel Corporation
+* Copyright 2020-2024 Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -95,6 +95,36 @@ dnnl_status_t DNNL_API dnnl_sycl_interop_memory_create(dnnl_memory_t *memory,
         const_dnnl_memory_desc_t memory_desc, dnnl_engine_t engine,
         dnnl_sycl_interop_memory_kind_t memory_kind, void *handle);
 
+#ifdef DNNL_EXPERIMENTAL_SPARSE
+/// Creates a memory object with multiple handles.
+///
+/// @param memory Output memory object.
+/// @param memory_desc Memory descriptor.
+/// @param engine Engine to use.
+/// @param memory_kind Memory allocation kind to specify the type of handles.
+/// @param nhandles Number of handles.
+/// @param handles Handles of the memory buffers to use as underlying storages.
+///     For each element of the @p handles array the following applies:
+///     - A USM pointer to the user-allocated buffer. In this case the library
+///       doesn't own the buffer. Requires @p memory_kind to be equal to
+///       dnnl_sycl_interop_usm.
+///     - A pointer to a SYCL buffer. In this case the library doesn't own the
+///       buffer. Requires @p memory_kind to be equal to
+///       dnnl_sycl_interop_buffer.
+///     - The DNNL_MEMORY_ALLOCATE special value. Instructs the library to
+///       allocate the buffer that corresponds to the memory allocation kind
+///       @p memory_kind for the memory object. In this case the library
+///       owns the buffer.
+///     - The DNNL_MEMORY_NONE specific value. Instructs the library to
+///       create memory object without an underlying buffer.
+/// @returns #dnnl_success on success and a status describing the error
+///     otherwise.
+dnnl_status_t DNNL_API dnnl_sycl_interop_memory_create_v2(dnnl_memory_t *memory,
+        const_dnnl_memory_desc_t memory_desc, dnnl_engine_t engine,
+        dnnl_sycl_interop_memory_kind_t memory_kind, int nhandles,
+        void **handles);
+#endif
+
 /// Returns the memory allocation kind associated with a memory object.
 ///
 /// @param memory Memory to query.
diff --git a/include/oneapi/dnnl/dnnl_sycl.hpp b/include/oneapi/dnnl/dnnl_sycl.hpp
index b9ddc876ed8..1f7d8f559c1 100644
--- a/include/oneapi/dnnl/dnnl_sycl.hpp
+++ b/include/oneapi/dnnl/dnnl_sycl.hpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2020-2022 Intel Corporation
+* Copyright 2020-2025 Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
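[Editor's note: the multi-handle interop entry points above can be exercised as in the following sketch. It is illustrative only and assumes a build with DNNL_EXPERIMENTAL_SPARSE, an OpenCL-capable engine, and the `memory::desc::csr()` helper from the experimental sparse API — that helper comes from the existing experimental API, not from this patch.]

```
// Hedged sketch, not part of the patch: create a CSR memory where every
// underlying buffer (values, indices, pointers) is allocated by the library.
#include "oneapi/dnnl/dnnl_ocl.hpp"

using namespace dnnl;

memory make_csr_memory(const engine &gpu_engine) {
    const memory::dim M = 64, N = 64, nnz = 128;
    // Assumed experimental sparse API: a CSR descriptor with f32 values and
    // s32 index/pointer buffers. A CSR tensor is backed by three buffers.
    auto md = memory::desc::csr({M, N}, memory::data_type::f32, nnz,
            memory::data_type::s32, memory::data_type::s32);
    // Passing no handles behaves as if every handle were
    // DNNL_MEMORY_ALLOCATE, per the documentation above.
    return ocl_interop::make_memory(
            md, gpu_engine, ocl_interop::memory_kind::usm);
}
```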
@@ -28,8 +28,6 @@
 
 #if __has_include(<sycl/sycl.hpp>)
 #include <sycl/sycl.hpp>
-#elif __has_include(<CL/sycl.hpp>)
-#include <CL/sycl.hpp>
 #else
 #error "Unsupported compiler"
 #endif
@@ -208,6 +206,83 @@ inline memory_kind get_memory_kind(const memory &amemory) {
     return static_cast<memory_kind>(ckind);
 }
 
+#ifdef DNNL_EXPERIMENTAL_SPARSE
+/// Creates a memory object with multiple handles.
+///
+/// @param memory_desc Memory descriptor.
+/// @param aengine Engine to use.
+/// @param kind Memory allocation kind to specify the type of handles.
+/// @param handles Handles of the memory buffers to use as underlying storages.
+///     For each element of the @p handles array the following applies:
+///     - A USM pointer to the user-allocated buffer. In this case the library
+///       doesn't own the buffer. Requires @p memory_kind to be equal to
+///       dnnl::sycl_interop::memory_kind::usm.
+///     - A pointer to a SYCL buffer. In this case the library doesn't own the
+///       buffer. Requires @p memory_kind to be equal to
+///       dnnl::sycl_interop::memory_kind::buffer.
+///     - The DNNL_MEMORY_ALLOCATE special value. Instructs the library to
+///       allocate the buffer that corresponds to the memory allocation kind
+///       @p memory_kind for the memory object. In this case the library
+///       owns the buffer.
+///     - The DNNL_MEMORY_NONE specific value. Instructs the library to
+///       create memory object without an underlying buffer.
+///
+/// If the @p handles vector is not provided the library will allocate all
+/// buffers as if all handles have the special value DNNL_MEMORY_ALLOCATE.
+///
+/// @returns Created memory object.
+inline memory make_memory(const memory::desc &memory_desc,
+        const engine &aengine, memory_kind kind,
+        std::vector<void *> handles = {}) {
+    if (handles.empty()) {
+        const int nhandles = memory_desc.get_num_handles();
+        handles.resize(nhandles, DNNL_MEMORY_ALLOCATE);
+    }
+
+    dnnl_memory_t c_memory;
+    error::wrap_c_api(
+            dnnl_sycl_interop_memory_create_v2(&c_memory, memory_desc.get(),
+                    aengine.get(), convert_to_c(kind), (int)handles.size(),
+                    handles.data()),
+            "could not create a memory");
+    return memory(c_memory);
+}
+
+/// Creates a memory object.
+///
+/// Unless @p handle is equal to DNNL_MEMORY_NONE or DNNL_MEMORY_ALLOCATE, the
+/// constructed memory object will have the underlying buffer set. In this
+/// case, the buffer will be initialized as if:
+/// - dnnl::memory::set_data_handle() had been called, if @p memory_kind is
+///   equal to dnnl::sycl_interop::memory_kind::usm, or
+/// - dnnl::sycl_interop::set_buffer() has been called, if @p memory_kind is
+///   equal to dnnl::sycl_interop::memory_kind::buffer.
+///
+/// @param memory_desc Memory descriptor.
+/// @param aengine Engine to use.
+/// @param kind Memory allocation kind to specify the type of handle.
+/// @param handle Handle of the memory buffer to use as an underlying storage.
+///     - A USM pointer to the user-allocated buffer. In this case the library
+///       doesn't own the buffer. Requires @p memory_kind to be equal to
+///       dnnl::sycl_interop::memory_kind::usm.
+///     - A pointer to a SYCL buffer. In this case the library doesn't own the
+///       buffer. Requires @p memory_kind to be equal to
+///       dnnl::sycl_interop::memory_kind::buffer.
+///     - The DNNL_MEMORY_ALLOCATE special value. Instructs the library to
+///       allocate the buffer that corresponds to the memory allocation kind
+///       @p memory_kind for the memory object. In this case the library
+///       owns the buffer.
+///     - The DNNL_MEMORY_NONE specific value. Instructs the library to
+///       create memory object without an underlying buffer.
+///
+/// @returns Created memory object.
+inline memory make_memory(const memory::desc &memory_desc,
+        const engine &aengine, memory_kind kind, void *handle) {
+    return make_memory(
+            memory_desc, aengine, kind, std::vector<void *> {handle});
+}
+#else
+
 /// Creates a memory object.
 ///
 /// Unless @p handle is equal to DNNL_MEMORY_NONE or DNNL_MEMORY_ALLOCATE, the
@@ -246,6 +321,7 @@ inline memory make_memory(const memory::desc &memory_desc,
             "could not create a memory");
     return memory(c_memory);
 }
+#endif
 
 /// Constructs a memory object from a SYCL buffer.
 ///
diff --git a/include/oneapi/dnnl/dnnl_threadpool.hpp b/include/oneapi/dnnl/dnnl_threadpool.hpp
index e3ebd0ff251..849465a540c 100644
--- a/include/oneapi/dnnl/dnnl_threadpool.hpp
+++ b/include/oneapi/dnnl/dnnl_threadpool.hpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2020-2022 Intel Corporation
+* Copyright 2020-2025 Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
diff --git a/include/oneapi/dnnl/dnnl_threadpool_iface.hpp b/include/oneapi/dnnl/dnnl_threadpool_iface.hpp
index 271d4db7f22..c3127c1d474 100644
--- a/include/oneapi/dnnl/dnnl_threadpool_iface.hpp
+++ b/include/oneapi/dnnl/dnnl_threadpool_iface.hpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2020-2024 Intel Corporation
+* Copyright 2020-2025 Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -14,8 +14,12 @@
 * limitations under the License.
 *******************************************************************************/
 
+/// @file
+/// Threadpool Interoperability C++ Types
+
 #ifndef ONEAPI_DNNL_DNNL_THREADPOOL_IFACE_HPP
 #define ONEAPI_DNNL_DNNL_THREADPOOL_IFACE_HPP
+// NOLINTBEGIN(readability-identifier-naming)
 
 #include <functional>
 #include <vector>
@@ -57,7 +61,7 @@
     /// waiting for the submitted closures to finish execution on its own.
     static constexpr uint64_t ASYNCHRONOUS = 1;
 
-    virtual ~threadpool_iface() {}
+    virtual ~threadpool_iface() = default;
 };
 
 } // namespace threadpool_interop
@@ -70,4 +74,5 @@ struct threadpool_iface {
 
 /// @} dnnl_api
 
-#endif
+// NOLINTEND(readability-identifier-naming)
+#endif /* ONEAPI_DNNL_DNNL_THREADPOOL_IFACE_HPP */
diff --git a/include/oneapi/dnnl/dnnl_types.h b/include/oneapi/dnnl/dnnl_types.h
index bb385ee2737..8821401352b 100644
--- a/include/oneapi/dnnl/dnnl_types.h
+++ b/include/oneapi/dnnl/dnnl_types.h
@@ -1,5 +1,6 @@
 /*******************************************************************************
-* Copyright 2016-2024 Intel Corporation
+* Copyright 2016-2025 Intel Corporation
+* Copyright 2024-2025 FUJITSU LIMITED
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -55,6 +56,8 @@ typedef enum {
     /// Format kind for sparse tensors.
     dnnl_format_kind_sparse,
 #endif
+    /// Format for sparse data.
+    dnnl_format_sparse,
     /// Parameter to allow internal only format kinds without undefined
     /// behavior. This parameter is chosen to be valid for so long as
     /// sizeof(int) >= 2.
@@ -74,6 +77,8 @@
     /// only be used to create a primitive descriptor to query the
     /// actual memory descriptor (similar to the format tag `any`).
     dnnl_packed,
+    /// Coordinate Sparse Encoding (COO).
+    dnnl_coo,
 } dnnl_sparse_encoding_t;
 #endif
 
@@ -284,6 +289,7 @@
     dnnl_ABcd8a16b2a,
     dnnl_ABcd2b8a4b,
     dnnl_ABcd8a8b,
+    dnnl_ABcd8a32b,
     dnnl_ABcd8a4b,
     /// 4D tensor blocked by 2nd dimension with block size 8
     dnnl_aBcd8b,
@@ -396,6 +402,8 @@
     dnnl_aCBdef16c16b,
     dnnl_aBdefc4b,
     dnnl_aBdefc8b,
+    dnnl_Abcdef4a,
+    dnnl_Abcdef8a,
     dnnl_Abcdef16a,
     dnnl_Abcdef32a,
     dnnl_aBedc16b,
@@ -1035,6 +1043,18 @@
     dnnl_bcad,
     dnnl_cabd,
     dnnl_dabc,
+    dnnl_Ab32a,
+    dnnl_aCBd8b8c,
+    dnnl_aCBde8b8c,
+    dnnl_BAc8a8b,
+    dnnl_BAcd8a8b,
+    dnnl_BAcde8a8b,
+    dnnl_aCBdef8b8c,
+    dnnl_abdEC16e4c,
+    dnnl_abDC16d4c,
+    dnnl_BA24b8a,
+    dnnl_aCB24c8b,
+    dnnl_abDC24d8c,
 
     /// Just a sentinel, not real memory format tag. Must be changed after new
     /// format tag is added.
@@ -1170,10 +1190,12 @@
     /// 5D LSTM projection tensor
     dnnl_ldOi16o = dnnl_abDc16d,
     dnnl_ldOi32o = dnnl_abDc32d,
+    dnnl_ldOI16o4i = dnnl_abDC16d4c,
     dnnl_ldOI32o4i = dnnl_abDC32d4c,
     dnnl_ldIo32i = dnnl_abCd32c,
     /// 6D RNN weights tensor
     dnnl_ldgOi16o = dnnl_abdEc16e,
+    dnnl_ldgOI16o4i = dnnl_abdEC16e4c,
     dnnl_ldgOi32o = dnnl_abdEc32e,
     dnnl_ldgOI32o2i = dnnl_abdEC32e2c,
     dnnl_ldgOI32o4i = dnnl_abdEC32e4c,
@@ -1255,6 +1277,7 @@
     dnnl_OI8i8o = dnnl_AB8b8a,
 
     // weights, 3D
+    dnnl_IOw8o8i = dnnl_BAc8a8b,
     dnnl_IOw16o16i = dnnl_BAc16a16b,
     dnnl_IOw16i16o = dnnl_BAc16b16a,
     dnnl_OIw16i16o = dnnl_ABc16b16a,
@@ -1325,6 +1348,7 @@
 
     // weights, 4D
     dnnl_IOhw16i16o = dnnl_BAcd16b16a,
+    dnnl_IOhw8o8i = dnnl_BAcd8a8b,
     dnnl_IOhw16o16i = dnnl_BAcd16a16b,
     dnnl_Ohwi16o = dnnl_Acdb16a,
     dnnl_OhwI16o2i = dnnl_AcdB16a2b,
@@ -1386,6 +1410,8 @@
     dnnl_OIhw2i8o4i = dnnl_ABcd2b8a4b,
     dnnl_IOhw8o16i2o = dnnl_BAcd8a16b2a,
     dnnl_OIhw8o8i = dnnl_ABcd8a8b,
+    dnnl_OIhw8o32i = dnnl_ABcd8a32b,
+    dnnl_OIhw16o32i = dnnl_ABcd16a32b,
     dnnl_OIhw8o4i = dnnl_ABcd8a4b,
     dnnl_Owhi16o = dnnl_Adcb16a,
     dnnl_OIhw8i32o = dnnl_ABcd8b32a,
@@ -1457,6 +1483,7 @@
     dnnl_OIdhw8o4i = dnnl_ABcde8a4b,
     dnnl_IOdhw16i16o = dnnl_BAcde16b16a,
     dnnl_OIdhw4o8i8o4i = dnnl_ABcde4a8b8a4b,
+    dnnl_IOdhw8o8i = dnnl_BAcde8a8b,
     dnnl_IOdhw16o16i = dnnl_BAcde16a16b,
     dnnl_OIdhw16o16i2o = dnnl_ABcde16a16b2a,
     dnnl_OIdhw8i32o = dnnl_ABcde8b32a,
@@ -1470,6 +1497,7 @@
     dnnl_Goiw16g = dnnl_Abcd16a,
     dnnl_Goiw8g = dnnl_Abcd8a,
     dnnl_Goiw4g = dnnl_Abcd4a,
+    dnnl_gIOw8o8i = dnnl_aCBd8b8c,
     dnnl_gIOw16o16i = dnnl_aCBd16b16c,
     dnnl_gIOw16i16o = dnnl_aCBd16c16b,
     dnnl_gOIw16i16o = dnnl_aBCd16c16b,
@@ -1515,6 +1543,7 @@
 
     // weights w/ groups, 4D
     dnnl_gIOhw16i16o = dnnl_aCBde16c16b,
+    dnnl_gIOhw8o8i = dnnl_aCBde8b8c,
     dnnl_gIOhw16o16i = dnnl_aCBde16b16c,
     dnnl_gOhwi16o = dnnl_aBdec16b,
     dnnl_gOhwI16o2i = dnnl_aBdeC16b2c,
@@ -1582,6 +1611,7 @@
 
     // weights w/ groups, 6D
     dnnl_gIOdhw16i16o = dnnl_aCBdef16c16b,
+    dnnl_gIOdhw8o8i = dnnl_aCBdef8b8c,
     dnnl_gIOdhw16o16i = dnnl_aCBdef16b16c,
     dnnl_gOdhwi16o = dnnl_aBdefc16b,
     dnnl_gOdhwI16o2i = dnnl_aBdefC16b2c,
@@ -1617,6 +1647,8 @@
     dnnl_gIOdhw8o16i2o = dnnl_aCBdef8b16c2b,
     dnnl_gOIdhw8o8i = dnnl_aBCdef8b8c,
     dnnl_gOIdhw8o4i = dnnl_aBCdef8b4c,
+    dnnl_Goidhw4g = dnnl_Abcdef4a,
+    dnnl_Goidhw8g = dnnl_Abcdef8a,
     dnnl_Goidhw16g = dnnl_Abcdef16a,
     dnnl_Goidhw32g = dnnl_Abcdef32a,
     dnnl_gOIdhw2i4o2i = dnnl_aBCdef2c4b2c,
@@ -1989,6 +2021,12 @@
     dnnl_deconvolution,
     /// An element-wise primitive.
     dnnl_eltwise,
+    /// A depthwise primitive.
+    dnnl_depthwise,
+    /// A quantization primitive.
+    dnnl_quantization,
+    /// A binarization primitive.
+ dnnl_binarization, /// An LRN primitive. dnnl_lrn, /// A batch normalization primitive. @@ -2081,6 +2119,12 @@ typedef enum { dnnl_eltwise_mish, /// Eltwise: hardswish dnnl_eltwise_hardswish, + /// Eltwise: hsigmoid + dnnl_eltwise_hsigmoid, + /// Eltwise: round_half_to_even + dnnl_eltwise_round_half_to_even, + /// Eltwise: round_half_away_from_zero + dnnl_eltwise_round_half_away_from_zero, /// Eltwise: ReLU (dst for backward) dnnl_eltwise_relu_use_dst_for_bwd = 0x100, /// Eltwise: hyperbolic tangent non-linearity (tanh) (dst for backward) @@ -2147,6 +2191,10 @@ typedef enum { dnnl_binary_eq = 0x1fffa, /// Binary not equal dnnl_binary_ne = 0x1fffb, + /// Binary select + dnnl_binary_select = 0x1fffc, + /// Binary prelu + dnnl_binary_prelu = 0x1fffd, /// Nearest Neighbor Resampling Method dnnl_resampling_nearest = 0x2fff0, /// Linear Resampling Method @@ -2173,6 +2221,13 @@ typedef enum { dnnl_softmax_accurate = 0x30000, /// Logsoftmax dnnl_softmax_log, + + dnnl_depthwise_scale_shift = 0x3fff0, + dnnl_depthwise_prelu = 0x3fff1, + + dnnl_quantization_quantize_dequantize = 0x4fff0, + dnnl_quantization_quantize = 0x4fff1, + dnnl_binarization_depthwise = 0x4fff2, } dnnl_alg_kind_t; /// Flags for normalization primitives. @@ -2259,7 +2314,12 @@ typedef enum { /// A `size_t` counterpart of the DNNL_RUNTIME_DIM_VAL. /// For instance, this value is returned by dnnl_memory_desc_get_size() if /// either of the dimensions or strides equal to #DNNL_RUNTIME_DIM_VAL. + +#if INTPTR_MAX == INT64_MAX #define DNNL_RUNTIME_SIZE_VAL ((size_t)DNNL_RUNTIME_DIM_VAL) +#else +#define DNNL_RUNTIME_SIZE_VAL ((size_t)INT32_MIN) +#endif /// @cond DO_NOT_DOCUMENT_THIS /// Hex representation for a **special** quiet NAN (!= NAN from math.h) @@ -2291,6 +2351,52 @@ typedef struct dnnl_memory_desc *dnnl_memory_desc_t; /// A memory descriptor handle. typedef const struct dnnl_memory_desc *const_dnnl_memory_desc_t; +/// Sparse encodings. +typedef enum { + dnnl_sparse_encoding_undef = 0, + dnnl_sparse_encoding_any, + dnnl_sparse_encoding_packed, + dnnl_sparse_encoding_csr, + dnnl_sparse_encoding_coo, +} dnnl_sparse_encoding_t; + +/* typedef struct dnnl_sparse_desc *dnnl_sparse_desc_t; */ +/* typedef const struct dnnl_sparse_desc *const_dnnl_sparse_desc_t; */ + +/// Flags for memory special features +typedef enum { + dnnl_memory_extra_flag_none = 0x0U, + /// Indicates the weights have an additional buffer, that depends on the + /// @p compensation_mask. + /// + /// For instance, in 4D case with the compensation mask equals (1 << 0) + /// the additional buffer would consist of OC values: + /// O[oc : 0,OC] = + /// -128 * SUM(ic : 0,IC; kh : 0,KH; kw : 0,KW){ weights(oc, ic, kh, kw) } + dnnl_memory_extra_flag_compensation_conv_s8s8 = 0x1U, + dnnl_memory_extra_flag_scale_adjust = 0x2U, + dnnl_memory_extra_flag_rnn_u8s8_compensation = 0x4U, + dnnl_memory_extra_flag_gpu_rnn_u8s8_compensation + = dnnl_memory_extra_flag_rnn_u8s8_compensation, + dnnl_memory_extra_flag_compensation_conv_asymmetric_src = 0x8U, + dnnl_memory_extra_flag_rnn_s8s8_compensation = 0x16U, +} dnnl_memory_extra_flags_t; + +/// Description of extra information stored in memory +typedef struct { + /// The flags contain arbitrary extra information, such as compensation. 
+ /// @sa dnnl_memory_extra_flags_t + uint64_t flags; + /// Compensation mask + int compensation_mask; + /// Scale applied to the data + float scale_adjust; + /// Compensation mask for asymmetric quantization + int asymm_compensation_mask; + /// For future backwards compatibility + char reserved[60]; +} dnnl_memory_extra_desc_t; + /// @struct dnnl_memory /// An opaque structure to describe a memory. struct dnnl_memory; @@ -2383,6 +2489,7 @@ typedef enum { dnnl_scratchpad_mode_user, } dnnl_scratchpad_mode_t; +/// Rounding mode typedef enum { /// rounding mode dictated by the floating-point environment dnnl_rounding_mode_environment, @@ -2529,6 +2636,12 @@ typedef const struct dnnl_primitive *const_dnnl_primitive_t; /// Bias tensor argument. #define DNNL_ARG_BIAS 41 +/// Reduce tensor argument. +#define DNNL_ARG_REDUCE 42 + +/// Note: when adding a new macro after `DNNL_ARG_REDUCE` please reserve a +/// space for potential indices for `DNNL_ARG_REDUCE`. + /// Mean values tensor argument. #define DNNL_ARG_MEAN 49 /// Variance values tensor argument. @@ -2642,6 +2755,7 @@ typedef const struct dnnl_primitive *const_dnnl_primitive_t; #define DNNL_ARG_ATTR_DROPOUT_SEED 511 /// Output scaling factors provided at execution time. +/// Deprecated value. #define DNNL_ARG_ATTR_OUTPUT_SCALES 513 /// Starting index for source arguments for primitives that take a variable @@ -2804,6 +2918,8 @@ typedef enum { dnnl_query_num_handles_s32, ///< Number of buffers required for a memory /// descriptor #endif + dnnl_query_sparse_encoding, + // Max value to prevent UB for internal use only dnnl_query_t dnnl_query_max = 0x7fff, } dnnl_query_t; @@ -2891,6 +3007,7 @@ typedef enum { dnnl_cpu_isa_avx10_1_512_amx_fp16 = 0x1fef, /// @copydoc dnnl_cpu_isa_avx10_1_512_amx_fp16 dnnl_cpu_isa_avx512_core_amx_fp16 = dnnl_cpu_isa_avx10_1_512_amx_fp16, + dnnl_cpu_isa_avx512_vpopcnt = 0x3fef, } dnnl_cpu_isa_t; /// CPU ISA hints flags diff --git a/include/oneapi/dnnl/dnnl_ukernel.h b/include/oneapi/dnnl/dnnl_ukernel.h index 102b2765373..50cdfb71c0c 100644 --- a/include/oneapi/dnnl/dnnl_ukernel.h +++ b/include/oneapi/dnnl/dnnl_ukernel.h @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2024 Intel Corporation +* Copyright 2024-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -185,13 +185,14 @@ dnnl_status_t DNNL_API dnnl_brgemm_finalize(dnnl_brgemm_t brgemm); /// Returns the packing type expected by a tensor B of a BRGeMM ukernel object. /// -/// @param brgemm BRGeMM ukernel object. -/// @param pack_type Output packing type. Can be `dnnl_brgemm_no_pack` if -/// packing is not expected, and `dnnl_brgemm_pack_32`, otherwise. +/// @param pack_type Output packing type. Can be `dnnl_brgemm_no_trans` if +/// packing is not expected, and `dnnl_pack_type_pack32`, otherwise. +/// @param dt_a Data type of tensor A. +/// @param dt_b Data type of tensor B. /// @returns #dnnl_success on success and a status describing the error /// otherwise. -dnnl_status_t DNNL_API dnnl_brgemm_get_B_pack_type( - const_dnnl_brgemm_t brgemm, dnnl_pack_type_t *pack_type); +dnnl_status_t DNNL_API dnnl_brgemm_get_B_pack_type(dnnl_pack_type_t *pack_type, + dnnl_data_type_t dt_a, dnnl_data_type_t dt_b); /// Returns the size of a scratchpad memory needed for the BRGeMM ukernel /// object. 
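[Editor's note: the reworked query above now depends only on the data types of tensors A and B, so it can be called before any BRGeMM object exists. A minimal sketch under that assumption (requires a build with DNNL_EXPERIMENTAL_UKERNEL; compiles as C or C++):]

```
#include "oneapi/dnnl/dnnl_ukernel.h"

/* Hedged sketch: returns 1 when tensor B must be repacked (pack32) before
 * execution for a bf16 x bf16 BRGeMM, 0 otherwise. */
int b_needs_pack32(void) {
    dnnl_pack_type_t pack_type;
    dnnl_status_t status
            = dnnl_brgemm_get_B_pack_type(&pack_type, dnnl_bf16, dnnl_bf16);
    return status == dnnl_success && pack_type == dnnl_pack_type_pack32;
}
```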
@@ -203,6 +204,17 @@ dnnl_status_t DNNL_API dnnl_brgemm_get_B_pack_type( dnnl_status_t DNNL_API dnnl_brgemm_get_scratchpad_size( const_dnnl_brgemm_t brgemm, size_t *size); +/// Returns the flag indicating when the call to `dnnl_brgemm_execute_postops` +/// is valid. +/// +/// @param brgemm BRGeMM ukernel object. +/// @param valid The flag indicating if `dnnl_brgemm_execute_postops` is valid +/// for a given ukernel object. `1` is for valid and `0`, otherwise. +/// @returns #dnnl_success on success and a status describing the error +/// otherwise. +dnnl_status_t DNNL_API dnnl_brgemm_is_execute_postops_valid( + const_dnnl_brgemm_t brgemm, int *valid); + /// Initializes the hardware-specific context. If no initialization required, /// returns the success status. /// diff --git a/include/oneapi/dnnl/dnnl_ukernel.hpp b/include/oneapi/dnnl/dnnl_ukernel.hpp index 642123842de..e42895973e3 100644 --- a/include/oneapi/dnnl/dnnl_ukernel.hpp +++ b/include/oneapi/dnnl/dnnl_ukernel.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2024 Intel Corporation +* Copyright 2024-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,6 +19,7 @@ #ifndef ONEAPI_DNNL_DNNL_UKERNEL_HPP #define ONEAPI_DNNL_DNNL_UKERNEL_HPP +// NOLINTBEGIN(readability-identifier-naming) #include "oneapi/dnnl/dnnl.hpp" #include "oneapi/dnnl/dnnl_ukernel.h" @@ -29,6 +30,8 @@ /// oneDNN namespace namespace dnnl { +#ifdef DNNL_EXPERIMENTAL_UKERNEL + /// @addtogroup dnnl_api_utils /// @{ @@ -59,6 +62,8 @@ struct handle_traits { /// @} dnnl_api_utils +#endif + /// @addtogroup dnnl_api_ukernel Ukernels /// Collection of ukernels /// @{ @@ -68,6 +73,10 @@ namespace ukernel { #ifdef DNNL_EXPERIMENTAL_UKERNEL +/// @addtogroup dnnl_api_ukernel_utils ukernel utils +/// ukernel utility functions +/// @{ + /// Packing specification enum class pack_type { /// Undefined pack type. A guard value. @@ -115,8 +124,8 @@ struct attr_params : public handle { /// Sets tensor B scales arguments to a storage. /// - /// If @ref brgemm::set_B_scales used mask of 2, then at least N values of - /// selected data type are expected. + /// If @ref attr_params::set_B_scales used mask of 2, then at + /// least N values of selected data type are expected. /// /// @param b_scales Pointer to scales storage. void set_B_scales(const void *b_scales) { @@ -136,11 +145,13 @@ struct attr_params : public handle { error::wrap_c_api(status, "could not set D scales argument"); } }; +/// @} dnnl_api_ukernel_utils /// @addtogroup dnnl_api_ukernel_brgemm BRGeMM ukernel /// BRGeMM ukernel routines /// @{ +/// BRGeMM ukernel struct brgemm : public handle { /// Default constructor. Produces an empty object. brgemm() = default; @@ -200,7 +211,7 @@ struct brgemm : public handle { /// /// @param ldd Leading dimension of tensor D. /// @param d_dt Data type of tensor D. - /// @param post_ops Primitive post-operation attributes to extend the kernel + /// @param po Primitive post-operation attributes to extend the kernel /// operations. void set_post_ops(memory::dim ldd, memory::data_type d_dt, const post_ops &po = default_post_ops()) { @@ -258,9 +269,14 @@ struct brgemm : public handle { /// Returns the packing type expected by a tensor B of a BRGeMM ukernel /// object. - pack_type get_B_pack_type() const { + /// + /// @param a_dt Data type of tensor A. + /// @param b_dt Data type of tensor B. 
+    static pack_type get_B_pack_type(
+            memory::data_type a_dt, memory::data_type b_dt) {
         dnnl_pack_type_t c_pack_type;
-        dnnl_status_t status = dnnl_brgemm_get_B_pack_type(get(), &c_pack_type);
+        dnnl_status_t status = dnnl_brgemm_get_B_pack_type(&c_pack_type,
+                memory::convert_to_c(a_dt), memory::convert_to_c(b_dt));
 
         if (status != dnnl_success)
             error::wrap_c_api(status, "could not query B pack type");
@@ -279,6 +295,21 @@ struct brgemm : public handle<dnnl_brgemm_t> {
         return size;
     }
 
+    /// Returns the flag indicating when the call to execute with post
+    /// operations is valid.
+    ///
+    /// Returns `true` for a valid call and `false` otherwise.
+    bool is_execute_postops_valid() const {
+        int valid;
+        dnnl_status_t status
+                = dnnl_brgemm_is_execute_postops_valid(get(), &valid);
+        if (status != dnnl_success)
+            error::wrap_c_api(status,
+                    "could not query a flag for execute postops from a BRGeMM "
+                    "ukernel object");
+        return static_cast<bool>(valid);
+    }
+
     /// Initializes the hardware-specific context. Affects the global state for
     /// all BRGeMM ukernel objects. If no initialization required, returns.
     void set_hw_context() const {
@@ -334,11 +365,11 @@ struct brgemm : public handle<dnnl_brgemm_t> {
     /// @param C Pointer to a tensor C (accumulation buffer).
     /// @param D Pointer to a tensor D (output buffer).
     /// @param scratchpad Pointer to a scratchpad buffer.
-    /// @param binary_po Binary post-op memory buffer. Must be passed If binary
-    ///     post-op was specified at construction call.
+    /// @param params Post-op memory arguments. Must be passed if a binary
+    ///     post-op or scales were set.
     void execute(const void *A, const void *B,
            const std::vector<std::pair<memory::dim, memory::dim>> &A_B_offsets,
-            void *C, void *D, void *scratchpad,
+            const void *C, void *D, void *scratchpad,
             const attr_params &params = default_attr_params()) const {
         // TODO: export batch_element to C API later for user to fill it and
         // pass directly to the call.
@@ -364,7 +395,13 @@
         return ap;
     }
 };
+/// @} dnnl_api_ukernel_brgemm
+
+/// @addtogroup dnnl_api_ukernel_transform Transform ukernel
+/// Transform routines
+/// @{
 
+/// Transform ukernel
 struct transform : public handle<dnnl_transform_t> {
     /// Default constructor. Produces an empty object.
     transform() = default;
@@ -419,7 +456,7 @@ struct transform : public handle<dnnl_transform_t> {
     }
 };
 
-/// @} dnnl_api_ukernel_brgemm
+/// @} dnnl_api_ukernel_transform
 
 #endif
 
@@ -431,4 +468,5 @@ struct transform : public handle<dnnl_transform_t> {
 
 /// @} dnnl_api
 
+// NOLINTEND(readability-identifier-naming)
 #endif /* ONEAPI_DNNL_DNNL_UKERNEL_HPP */
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 00000000000..c29a82348f6
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,3 @@
+[tool.black]
+line-length = 80
+include = 'scripts\/.*\.pyi?$'
diff --git a/scripts/generate_dnnl_debug.py b/scripts/generate_dnnl_debug.py
index 84c5b086aad..5c197152c99 100755
--- a/scripts/generate_dnnl_debug.py
+++ b/scripts/generate_dnnl_debug.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 ################################################################################
-# Copyright 2018-2024 Intel Corporation
+# Copyright 2018-2025 Intel Corporation
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
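[Editor's note: tying the C++ ukernel changes above together: `get_B_pack_type()` is now a static query on data types, and `is_execute_postops_valid()` selects between the two `execute()` overloads. A hedged sketch of the resulting flow (assumes DNNL_EXPERIMENTAL_UKERNEL; sizes and leading dimensions are illustrative, not taken from the patch):]

```
#include <cstdint>
#include <vector>
#include "oneapi/dnnl/dnnl_ukernel.hpp"

using namespace dnnl;

void run_f32_brgemm(const float *A, const float *B, float *C) {
    const memory::dim M = 16, N = 64, K = 64;
    // Query packing requirements before creating the ukernel object.
    ukernel::pack_type pt = ukernel::brgemm::get_B_pack_type(
            memory::data_type::f32, memory::data_type::f32);
    if (pt != ukernel::pack_type::no_trans) {
        // B would have to be repacked with the transform ukernel first.
    }

    ukernel::brgemm brg(M, N, K, /*batch_size=*/1, /*lda=*/K, /*ldb=*/N,
            /*ldc=*/N, memory::data_type::f32, memory::data_type::f32,
            memory::data_type::f32);
    brg.finalize();
    brg.generate();

    std::vector<uint8_t> scratchpad(brg.get_scratchpad_size());
    brg.set_hw_context();
    // Without post-ops configured, the plain execute() overload applies.
    if (!brg.is_execute_postops_valid())
        brg.execute(A, B, {{0, 0}}, C, scratchpad.data());
    ukernel::brgemm::release_hw_context();
}
```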
@@ -24,43 +24,20 @@ import xml.etree.ElementTree as ET -def banner(year_from): - year_now = str(datetime.datetime.now().year) - banner_year = ( - year_from if year_now == year_from else "%s-%s" % (year_from, year_now) - ) +def template(body, banner): return """\ -/******************************************************************************* -* Copyright %s Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - +%s // DO NOT EDIT, AUTO-GENERATED // Use this script to update the file: scripts/%s // clang-format off -""" % ( - banner_year, +%s""" % ( + banner, os.path.basename(__file__), + body ) - -def template(body, year_from): - return "%s%s" % (banner(year_from), body) - - def header(body): return ( """\ @@ -129,6 +106,7 @@ def header_benchdnn(body): #ifdef DNNL_EXPERIMENTAL_SPARSE const char *sparse_encoding2str(dnnl_sparse_encoding_t encoding); #endif +const char *sparse_encoding2str(dnnl_sparse_encoding_t encoding); /* engine kind */ const char *engine_kind2str(dnnl_engine_kind_t kind); @@ -183,6 +161,9 @@ def source_benchdnn(body): return dnnl_sparse_encoding2str(encoding); } #endif +const char *sparse_encoding2str(dnnl_sparse_encoding_t encoding) { + return dnnl_sparse_encoding2str(encoding); +} const char *engine_kind2str(dnnl_engine_kind_t kind) { return dnnl_engine_kind2str(kind); @@ -320,7 +301,7 @@ def str_to_func(enum, values, is_dnnl=True): return func -def generate(ifile, banner_years): +def generate(ifile, banners): h_body, s_body = "", "" h_benchdnn_body, s_benchdnn_body = "", "" root = ET.parse(ifile).getroot() @@ -361,7 +342,7 @@ def generate(ifile, banner_years): header_benchdnn(h_benchdnn_body), source_benchdnn(s_benchdnn_body), ] - return [template(b, y) for b, y in zip(bodies, banner_years)] + return [template(b, y) for b, y in zip(bodies, banners)] def usage(): @@ -380,7 +361,6 @@ def usage(): ) sys.exit(1) - for arg in sys.argv: if "-help" in arg: usage() @@ -396,12 +376,12 @@ def usage(): "%s/../tests/benchdnn/dnnl_debug_autogenerated.cpp" % script_root, ) -banner_years = [] +banners = [] for file_path in file_paths: with open(file_path, "r") as f: - m = re.search(r"Copyright (.*) Intel", f.read()) - banner_years.append(m.group(1).split("-")[0]) + m = re.match(r'^/\*+\n(\*.*\n)+\*+/\n', f.read()) + banners.append('' if m == None else m.group(0)) -for file_path, file_body in zip(file_paths, generate(ifile, banner_years)): +for file_path, file_body in zip(file_paths, generate(ifile, banners)): with open(file_path, "w") as f: f.write(file_body) diff --git a/scripts/synthdnn/README.md b/scripts/synthdnn/README.md new file mode 100644 index 00000000000..6fd3ac003f6 --- /dev/null +++ b/scripts/synthdnn/README.md @@ -0,0 +1,27 @@ +# Synthdnn + +Synthdnn is a suite of scripts for collecting and analyzing oneDNN performance +across a randomly generated data. 
The general architecture is intended to follow +a data pipeline composed of synthetic problem generation, data collection, and +data analysis. The `synthdnn.py` script provides a command line interface to +these tools. Sample Usage: + + +Problem Generation: +``` +python3 synthdnn.py [sampling controls] -b +``` +Performance Data Collection: +``` +python3 synthdnn.py collect --engine= --collect -b +``` + +Problem Generation and Performance Data Collection: +``` +python3 synthdnn.py [sampling controls] --engine= --collect +``` + +Report Generation: Not yet implemented. +``` + +See `synthdnn.py -h` for additional details. diff --git a/scripts/synthdnn/matmul/primitive.py b/scripts/synthdnn/matmul/primitive.py new file mode 100644 index 00000000000..0a30fcd6781 --- /dev/null +++ b/scripts/synthdnn/matmul/primitive.py @@ -0,0 +1,216 @@ +################################################################################ +# Copyright 2025 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import itertools + + +class Dims: + def __init__(self, b, m, n, k): + # b is a list due to variable size + self.b = b + self.m = m + self.n = n + self.k = k + + def __str__(self): + a_dims = self.b + [self.m, self.k] + b_dims = self.b + [self.k, self.n] + a_str = "x".join([str(x) for x in a_dims]) + b_str = "x".join([str(x) for x in b_dims]) + return f"{a_str}:{b_str}" + + def __eq__(self, other): + return (self.b, self.m, self.n, self.k) == ( + other.b, + other.m, + other.n, + other.k, + ) + + def __hash__(self): + return hash((self.b, self.m, self.n, self.k)) + + +class Layouts: + class Layout: + def __init__(self, layout): + self.A, self.B, self.C = layout.split(":") + + def benchdnn_str(self): + return f"--stag={self.A} --wtag={self.B} --dtag={self.C}" + + def __init__(self, layouts, ndims): + if layouts == "all": + self.values = self.supported(ndims) + else: + self.values = [self.Layout(x) for x in layouts.split(",")] + + def __iter__(self): + return iter(self.values) + + @staticmethod + def supported(ndims): + if ndims < 2 or ndims > 6: + raise RuntimeError(f"No support for ndims={ndims}") + dims_base = "abcdef" + gemm_kn = dims_base[ndims - 1] + gemm_mk = dims_base[ndims - 2] + perms = [ + "".join(p) + for p in itertools.permutations(dims_base[:ndims]) + if p[-1] == gemm_kn or p[-1] == gemm_mk + ] + perms.insert(0, "any") + return [ + Layouts.Layout(f"{a}:{b}:{c}") + for a, b, c in itertools.product(perms, perms, perms) + if c == "any" or c[-1] == gemm_kn + ] + + +class Types: + class Type: + def __init__(self, type_str): + s = type_str.split("(") + self.A, self.B, self.C = s[0].split(":") + self.A, self.B, self.C = self.wildcard_match(self.A, self.B, self.C) + if len(s) < 2: + self.mode = None + else: + self.mode = s[1].strip(")") + + @staticmethod + def wildcard_match(A, B, C): + wildcard_match = A + B = B.replace("*", wildcard_match) + C = C.replace("*", wildcard_match) + return [A, B, C] + + def __str__(self): 
+ mode_str = "" + if self.mode: + mode_str = f"({self.mode})" + return f"{self.A}:{self.B}:{self.C}{mode_str}" + + def benchdnn_str(self): + mode_str = "" + if not self.mode is None: + mode_str = f"--attr-fpmath={self.mode}" + return f"--dt={self.A}:{self.B}:{self.C} {mode_str}" + + def __eq__(self, other): + return (self.A, self.B, self.C, self.mode) == ( + other.A, + other.B, + other.C, + other.mode, + ) + + def __init__(self, types): + if types == "all": + self.values = self.supported() + else: + self.values = [self.Type(x) for x in types.split(",")] + + def __str__(self): + return ",".join([str(x) for x in self.values]) + + def __iter__(self): + return iter(self.values) + + @staticmethod + def supported(): + support_matrix = [ + [["f64"], ["f64"], ["f64"]], + [["f32"], ["f32"], ["f32"]], + [["f32"], ["u8", "s8"], ["f32", "f16", "bf16"]], + [ + ["f16", "bf16"], + ["*", "u8", "s8", "u4", "s4"], + ["f32", "*", "u8", "s8"], + ], + [["u8", "s8"], ["u8"], ["f32", "bf16", "f16", "s32", "u8", "s8"]], + [ + ["f8_e5m2", "f8_e4m3"], + ["f8_e5m2", "f8_e4m3"], + ["f32", "bf16", "f16", "f8_e5m2", "f8_e4m3"], + ], + ] + + def is_int_type(t): + return t in ["u4", "s4", "u8", "s8", "s32"] + + def get_accumulator(wei): + if is_int_type(wei): + return "s32" + if wei == "f64": + return "f64" + return "f32" + + def get_fpmath_modes(src, wei, dst): + src, wei, dst = Types.Type.wildcard_match(src, wei, dst) + if get_accumulator(wei) == "f32": + ret = [""] + if "f32" in [src, wei]: + ret.append("(tf32)") + if "f32" in [src, wei] and not "f16" in [src, wei]: + ret.append("(bf16)") + if "f32" in [src, wei] and not "bf16" in [src, wei]: + ret.append("(f16)") + return ret + if ( + get_accumulator(wei) == "s32" + and not is_int_type(dst) + and not is_int_type(src) + ): + ret = [] + if "f32" in [src, wei]: + ret.append("(strict:true)") + ret.append("(tf32:true)") + if "f16" not in [src, wei]: + ret.append("(bf16:true)") + if "bf16" not in [src, wei]: + ret.append("(f16:true)") + return ret + return [""] + + out = [] + for c in support_matrix: + for src, wei, dst in itertools.product(c[0], c[1], c[2]): + for math in get_fpmath_modes(src, wei, dst): + out.append(Types.Type(f"{src}:{wei}:{dst}{math}")) + return out + + +# Kind represents problem parameters that do not make sense to consider +# in aggregate for optimization purposes as these features require significant +# changes within generated implementations or the implementation dispatching. +class Kind: + def __init__(self, layout, type): + self.layout = layout + self.type = type + + def benchdnn_str(self): + return f"{self.layout.benchdnn_str()} {self.type.benchdnn_str()}" + + +class Primitive: + def __init__(self, kind, dims): + self.kind: Kind = kind + self.dims = dims + + def benchdnn_str(self): + return f"{self.kind.benchdnn_str()} {self.dims}" diff --git a/scripts/synthdnn/matmul/sampler.py b/scripts/synthdnn/matmul/sampler.py new file mode 100755 index 00000000000..77a50737559 --- /dev/null +++ b/scripts/synthdnn/matmul/sampler.py @@ -0,0 +1,157 @@ +################################################################################ +# Copyright 2025 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import itertools +import random +import math + +from matmul.primitive import * + + +class Region: + def __init__(self, line): + restrictions = [] + for x in line.split(":"): + if len(x) <= 0 or x[0] != "(" or x[-1] != ")": + raise RuntimeError( + f"Unable to parse restrictions: {x} in {line}" + ) + restrictions.append(x[1:-1]) + if len(restrictions) != 3: + raise RuntimeError(f"Invalid number of restrictions in {line}") + + self.min = [int(x) for x in restrictions[0].split(",")] + self.max = [int(x) for x in restrictions[1].split(",")] + self.alignment = [int(x) for x in restrictions[2].split(",")] + + if len(self.min) != len(self.max) or len(self.min) != len( + self.alignment + ): + raise RuntimeError( + f"Inconsistent number of dimensions between restrictions in {line}" + ) + + self.ndims = len(self.min) + + def __str__(self): + str_min = ",".join([str(x) for x in self.min]) + str_max = ",".join([str(x) for x in self.max]) + str_alignment = ",".join([str(x) for x in self.alignment]) + return f"({str_min}):({str_max}):({str_alignment})" + + +class Sampler: + def __init__(self, samples, mode, types, layouts, region): + self.layouts = layouts + self.mode = mode + self.types = types + self.region = region + self.samples = samples + + random.seed("oneDNN Matmul") + self.kinds = [Kind(x, y) for x, y in itertools.product(layouts, types)] + random.shuffle(self.kinds) + self.dim_sampler = self.DimSampler(region) + + def __str__(self): + return f"-s {self.samples} -m {self.mode} -l {self.layouts} -r {self.region} -t {self.types}" + + def __iter__(self): + if self.mode == "zip": + return self.ZipIter(self.samples, self.kinds, self.dim_sampler) + elif self.mode == "product": + return self.ProductIter(self.samples, self.kinds, self.dim_sampler) + else: + raise RuntimeError(f"Unknown iteration mode {self.mode}") + + # Itertools.product seems to break on an infinite sampler + class ProductIter: + def __init__(self, samples, kinds, dim_sampler): + self.dim_sampler = dim_sampler + self.kinds = kinds + self.kinds_iter = iter(self.kinds) + self.rem_samples = samples + + def __next__(self): + if self.rem_samples == 0: + raise StopIteration + + try: + self.k = next(self.kinds_iter) + self.s = next(self.dim_sampler) + except StopIteration: + self.kinds_iter = iter(self.kinds) + self.k = next(self.kinds_iter) + self.s = next(self.dim_sampler) + self.rem_samples = self.rem_samples - 1 + + return Primitive(self.k, self.s) + + class ZipIter: + def __init__(self, samples, kinds, dim_sampler): + self.dim_sampler = dim_sampler + self.kinds_iter = itertools.cycle(kinds) + self.rem_samples = samples + + def __next__(self): + if self.rem_samples == 0: + raise StopIteration + + self.rem_samples = self.rem_samples - 1 + k = next(self.kinds_iter) + s = next(self.dim_sampler) + + return Primitive(k, s) + + class DimSampler: + def __init__(self, region): + self.region = region + self.seen = set() + if region.ndims < 3: + raise RuntimeError( + f"Insufficient dimensions for matmul operation, expected at least 3, but got 
{region.ndims}" + ) + + def __next__(self): + + # Sample from a power distribution as most problem features occur + # when some dimension is small. In addition, small problems often + # require less time to run enabling faster data collection + def get_sample(minval, maxval, align): + assert minval <= maxval, "Sample bounds are out of order" + if minval == maxval: + return minval + x = round( + pow(2, random.uniform(math.log2(minval), math.log2(maxval))) + ) + return (x // align) * align + + for _ in range(1000): + dims = [0] * self.region.ndims + for i in range(self.region.ndims): + dims[i] = get_sample( + self.region.min[i], + self.region.max[i], + self.region.alignment[i], + ) + dims_tuple = tuple(dims) + if dims_tuple not in self.seen: + self.seen.add(dims_tuple) + return Dims(dims[:-3], dims[-3], dims[-2], dims[-1]) + + raise RuntimeError( + f"Cannot sample >{len(self.seen)} problems in region {self.region}" + ) diff --git a/scripts/synthdnn/synthdnn.py b/scripts/synthdnn/synthdnn.py new file mode 100755 index 00000000000..7c0f170e3e5 --- /dev/null +++ b/scripts/synthdnn/synthdnn.py @@ -0,0 +1,205 @@ +#! /bin/python3 +################################################################################ +# Copyright 2025 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import argparse +import os +import sys +from tempfile import NamedTemporaryFile + +from matmul import sampler as matmul_sampler +from matmul import primitive as matmul + + +def log(output): + print("synthdnn: " + output) + + +def error(output): + print("synthdnn: error: " + output) + exit(1) + + +def write_batch_file(batch_file, samples, optional_args): + batch_file.write("#### Auto-generated by synthdnn\n") + batch_file.write(f"#### python3 synthdnn.py {' '.join(sys.argv[1:])}\n\n") + for s in samples: + batch_file.write(f"--reset {optional_args}{s.benchdnn_str()}\n") + batch_file.flush() + + +def setup_collect_args(parser, req): + parser.add_argument( + "-b", + "--batch-file", + required=req, + default=None, + help="batch file used for the operation", + ) + + # Interface with benchdnn + nargs = 1 + if not req: + nargs = "?" 
+    parser.add_argument(
+        "benchdnn", nargs=nargs, help="path to benchdnn executable"
+    )
+    parser.add_argument(
+        "--engine", default="cpu", help="engine used for benchdnn execution"
+    )
+    parser.add_argument(
+        "--impl",
+        default=None,
+        help="implementation to use in benchdnn execution",
+    )
+    parser.add_argument(
+        "--skip-impl",
+        default=None,
+        help="implementation to skip in benchdnn execution",
+    )
+    parser.add_argument(
+        "--collect",
+        default="corr",
+        help="benchdnn collection type, can be one of [corr, perf]",
+    )
+    parser.add_argument("-n", "--name", default="", help="sample name")
+
+
+def setup_collect_subparser(subparsers):
+    collect_parser = subparsers.add_parser(
+        "collect", help="call with -h for information"
+    )
+    collect_parser.add_argument(
+        "--subprogram_main", default=collect_main, help=argparse.SUPPRESS
+    )
+
+    setup_collect_args(collect_parser, True)
+
+
+def get_optional_args(args):
+    optional_args = []
+    if args.impl:
+        optional_args.append(f"--impl={args.impl}")
+    if args.skip_impl:
+        optional_args.append(f"--skip-impl={args.skip_impl}")
+
+    if len(optional_args) > 0:
+        return " ".join(optional_args) + " "
+
+    return ""
+
+
+def collect_main(args):
+    # args.benchdnn may be a list depending on command line setup
+    benchdnn = args.benchdnn
+    if isinstance(benchdnn, list):
+        benchdnn = benchdnn[0]
+
+    if not os.path.exists(benchdnn):
+        error(f"cannot execute {benchdnn}, no such file exists")
+
+    if args.collect == "corr":
+        benchdnn_args = f"--engine={args.engine} --matmul --mode-modifier=P {get_optional_args(args)}"
+    elif args.collect == "perf":
+        benchdnn_args = f"--engine={args.engine} --matmul --mode=F --cold-cache=all --perf-template=sample,{args.name},%prb%,%0Gflops%,%0Gbw% --memory-kind=usm_device --attr-scratchpad=user {get_optional_args(args)}"
+        if args.name.find(",") != -1:
+            error(f"sample name {args.name} contains invalid character: ,")
+    else:
+        error(f"unknown collection method {args.collect}")
+
+    cmd = f"{benchdnn} {benchdnn_args} --batch={args.batch_file}"
+    log(f"executing: {cmd}")
+    ret = os.system(cmd)
+    if ret != 0:
+        error(f"execution of {cmd} failed with return code {ret}")
+    log("execution complete")
+
+
+def setup_matmul_subparser(subparsers):
+    matmul_parser = subparsers.add_parser(
+        "matmul", help="call with -h for information"
+    )
+    matmul_parser.add_argument(
+        "--subprogram_main", default=matmul_main, help=argparse.SUPPRESS
+    )
+
+    # Data Collection shortcut
+    setup_collect_args(matmul_parser, False)
+
+    # Sampler Arguments
+    matmul_parser.add_argument(
+        "-l",
+        "--layouts",
+        default="all",
+        help='stag:wtag:dtag, comma separated list of layouts or "all" for every supported layout',
+    )
+    matmul_parser.add_argument(
+        "-m",
+        "--iter-mode",
+        default="zip",
+        help="iteration mode, must be one of zip or product",
+    )
+    matmul_parser.add_argument(
+        "-r",
+        "--region",
+        default="(1,1,1,1):(8,8192,8192,8192):(1,1,1,1)",
+        help="([b_min,]m_min,n_min,k_min):([b_max,]m_max,n_max,k_max):([b_align,]m_align,n_align,k_align)",
+    )
+    matmul_parser.add_argument(
+        "-s", "--samples", default=1000, help="number of samples to collect"
+    )
+    matmul_parser.add_argument(
+        "-t",
+        "--types",
+        default="all",
+        help='dt:dt:dt(optional fpmath-mode), comma separated list of type configurations or "all" for every supported type',
+    )
+
+
+def matmul_main(args):
+    batch_file = (
+        open(args.batch_file, "w+t") if args.batch_file is not None else None
+    )
+    if args.benchdnn is not None and batch_file is None:
+        batch_file = NamedTemporaryFile("w+t")
+
+    
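# For illustration: the default region "(1,1,1,1):(8,8192,8192,8192):(1,1,1,1)"
+    # parses into min=[1,1,1,1], max=[8,8192,8192,8192], alignment=[1,1,1,1],
+    # i.e. four dimensions (batch, m, n, k) with no alignment constraint.
+    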
region = matmul_sampler.Region(args.region)
+    types = matmul.Types(args.types)
+    layouts = matmul.Layouts(args.layouts, region.ndims - 1)
+    samples = matmul_sampler.Sampler(
+        int(args.samples), args.iter_mode, types, layouts, region
+    )
+    if batch_file:
+        log(f"generating batch file: {batch_file.name}")
+        write_batch_file(batch_file, samples, get_optional_args(args))
+        log("generation complete")
+    else:
+        write_batch_file(sys.stdout, samples, get_optional_args(args))
+
+    if args.benchdnn:
+        args.batch_file = batch_file.name
+        collect_main(args)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    subparsers = parser.add_subparsers(
+        help="primitive targeted for data collection"
+    )
+    setup_collect_subparser(subparsers)
+    setup_matmul_subparser(subparsers)
+    args = parser.parse_args()
+    args.subprogram_main(args)
diff --git a/scripts/verbose_converter/README.md b/scripts/verbose_converter/README.md
index 3a7b3af3e26..983a5ecc26a 100644
--- a/scripts/verbose_converter/README.md
+++ b/scripts/verbose_converter/README.md
@@ -1,7 +1,7 @@
 # Verbose log converter
 
 Verbose log converter is a tool that allows to convert [oneDNN
-verbose](https://oneapi-src.github.io/oneDNN/dev_guide_verbose.html)
+verbose](https://uxlfoundation.github.io/oneDNN/dev_guide_verbose.html)
 output to various outputs (input files for benchdnn and execution
 statistics breakdown at this time). The tool can be extended to produce
 other types of output by adding generators.
diff --git a/scripts/verbose_converter/src/__init__.py b/scripts/verbose_converter/src/__init__.py
new file mode 100644
index 00000000000..3acfa281439
--- /dev/null
+++ b/scripts/verbose_converter/src/__init__.py
@@ -0,0 +1,18 @@
+################################################################################
+# Copyright 2024 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
+
+# This file marks this directory as a package and is needed to allow relative
+# imports. See https://docs.python.org/3/tutorial/modules.html#packages.
diff --git a/scripts/verbose_converter/src/benchdnn_generator.py b/scripts/verbose_converter/src/benchdnn_generator.py
index 9ed56f199cf..6131c9d70d5 100644
--- a/scripts/verbose_converter/src/benchdnn_generator.py
+++ b/scripts/verbose_converter/src/benchdnn_generator.py
@@ -1,5 +1,5 @@
 ################################################################################
-# Copyright 2020-2024 Intel Corporation
+# Copyright 2020-2025 Intel Corporation
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -14,831 +14,848 @@
 # limitations under the License.
 
################################################################################ +import logging +from collections import defaultdict +from typing import Dict, List, Mapping, Optional, Set, cast -def everyone_is(list, value="None"): - if [value == "None"]: - value = list[0] - return [e for e in list if e != value] == [] - - -primitives_with_algs = ( - "binary", - "convolution", - "deconvolution", - "eltwise", - "lrn", - "pooling", - "reduction", - "resampling", - "rnn", -) - - -def alg_remove_primitive(alg): - for p in primitives_with_algs: - if alg.find(p) != -1: - alg = alg[(alg.find(p) + len(p) + 1) :] - return alg - - -def convert_driver(prop_kind): - driver = { - "batch_normalization": "bnorm", - "binary": "binary", - "brgemm": "brgemm", - "concat": "concat", - "convolution": "conv", - "deconvolution": "deconv", - "eltwise": "eltwise", - "group_normalization": "gnorm", - "inner_product": "ip", - "layer_normalization": "lnorm", - "lrn": "lrn", - "matmul": "matmul", - "pooling": "pool", - "prelu": "prelu", - "reduction": "reduction", - "reorder": "reorder", - "resampling": "resampling", - "rnn": "rnn", - "shuffle": "shuffle", - "softmax": "softmax", - "sum": "sum", - }.get(prop_kind) - return driver - - -def convert_engine(engine): - return f"--engine={engine}" - - -def convert_dir(entry): - # get base direction - dir = { - "forward_training": "FWD_D", - "forward_inference": "FWD_I", - "backward_data": "BWD_D", - "backward_weights": "BWD_W", - "backward": "BWD_DW", - }.get(entry["prop_kind"]) - - if not dir: - return "" +from . import ir - found_bias = [ - e for e in entry["mds"] if "bia" == e["arg"] and e["data_type"] != "undef" - ] - dir = "FWD_B" if "FWD" in dir and found_bias else dir - dir = "BWD_WB" if dir == "BWD_W" and found_bias else dir - if entry["prim_kind"] == "rnn": - return f"--prop={dir}" - else: - return f"--dir={dir}" - - -def convert_aux(entry): - if entry.get("aux") != None: - alg = entry["aux"]["alg"] if entry["aux"].get("alg") != None else "" - pk = entry["prim_kind"] - if pk == "convolution": - str = "" - alg = alg_remove_primitive(alg) - algs = {"winograd": "WINO", "direct": "direct"} - alg = algs.get(alg) - if alg != None: - str = f"--alg={alg}" - return str - if pk == "eltwise": - alpha = entry["aux"]["alpha"] - beta = entry["aux"]["beta"] - alg += f" --alpha={alpha} --beta={beta}" - return f"--alg={alg}" - elif pk == "concat": - axis = entry["aux"]["axis"] - return f"--axis={axis}" - elif pk in [ - "batch_normalization", - "layer_normalization", - "group_normalization", - ]: - flags = entry["aux"]["flags"] - return f"--flags={flags}" - elif pk == "lrn": - str = "" - alg = alg_remove_primitive(alg) - algs = {"across_channels": "ACROSS", "within_channel": "WITHIN"} - alg = algs.get(alg) - if alg != None: - str = f"--alg={alg}" - return str - elif pk == "reduction": - p = entry["aux"]["p"] - eps = entry["aux"]["eps"] - alg += f" --p={p} --eps={eps}" - return f"--alg={alg}" - elif pk == "rnn": - str = "" - algs = { - "vanilla_rnn": "VANILLA_RNN", - "vanilla_lstm": "VANILLA_LSTM", - "vanilla_gru": "VANILLA_GRU", - "vanilla_augru": "VANILLA_AUGRU", - "lbr_gru": "LBR_GRU", - "lbr_augru": "LBR_AUGRU", - } - alg = algs.get(alg) - if alg != None: - str += f"--alg={alg}" - ir_dir = entry["aux"]["direction"] - dirs = { - "unidirectional_left2right": "left2right", - "unidirectional_right2left": "right2left", - "bidirectional_sum": "sum", - "bidirectional_concat": "concat", - } - dir = dirs.get(ir_dir) - if dir is not None: - str += f" --direction={dir}" - ir_act = 
entry["aux"]["activation"] - acts = { - "eltwise_relu": "RELU", - "eltwise_logistic": "LOGISTIC", - "eltwise_tanh": "TANH", - } - act = acts.get(ir_act) - if act is not None: - str += f" --activation={act}" - flags = entry["aux"]["flags"] - if flags is not None: - str += f" --flags={flags}" - return str - elif pk == "shuffle": - axis = entry["aux"]["axis"] - group = entry["aux"]["group"] - return f"--axis={axis} --group={group}" - elif pk == "softmax": - axis = entry["aux"]["axis"] - return f"--alg={alg} --axis={axis}" - elif pk == "pooling": - return f"--alg={alg}" - elif pk == "matmul": - runtime_dims_masks = ( - entry["aux"]["runtime_dims_masks"] - if entry["aux"].get("runtime_dims_masks") != None - else "" - ) - return f"--runtime_dims_masks={runtime_dims_masks}" - elif pk == "reorder": - runtime_dim_mask = ( - entry["aux"]["runtime-dim-mask"] - if entry["aux"].get("runtime-dim-mask") != None - else "" - ) - return f"--runtime-dim-mask={runtime_dim_mask}" - elif pk == "brgemm": - bs = entry["aux"]["bs"] if entry["aux"].get("bs") != None else "" - beta = entry["aux"]["beta"] if entry["aux"].get("beta") != None else "" - return f"--bs={bs} --beta={beta}" - else: - alg = alg_remove_primitive(alg) - if alg != "": - return f"--alg={alg}" - return "" - - -def convert_bias_mask(mds): - bia_mds = [md for md in mds if md["arg"] == "bia"] - if len(bia_mds) != 0: - bia_md = bia_mds[0] - flags = bia_md["flags"]["value"].split("_") - if len(flags) > 1: - mask = flags[1][4:] - return f"--bia_mask={mask}" - return "" - - -def convert_dts(mds, prim_kind): - def convert_dts_common(mds): - dts = [md["data_type"] for md in mds if md["data_type"] != "undef"] - dt = dts[0] - return f"--dt={dt}" - - def convert_dts_cfg_rnn(mds): - cfg = "--cfg=" - args = ["src_iter", "src_iter_c", "src_layer", "dst_iter", "dst_layer", "bias"] - mds_strip = [md for md in mds if md["arg"] in args] - # ws is not part of cfg - mds_strip = [md for md in mds_strip if "ws" not in md["arg"]] - # bias is not part of cfg - mds_strip = [md for md in mds_strip if "bia" not in md["arg"]] - common_dt = everyone_is([md["data_type"] for md in mds_strip]) - if common_dt and mds_strip[0]["data_type"] in ["f32", "f16"]: - cfg += mds_strip[0]["data_type"] - elif common_dt and mds_strip[0]["data_type"] == "bf16": - cfg += mds_strip[0]["data_type"] - # bias is part of cfg for bf16 - bias_md = [md for md in mds if md["arg"] == "bias"][0] - bias_dt = bias_md["data_type"] - if bias_dt != mds_strip[0]["data_type"]: - cfg += bias_dt - else: - for arg in args: - for md in mds_strip: - if md["arg"] == arg: - # src iter is skipped if it is f32 - if arg == "src_iter_c" and md["data_type"] == "f16": - continue - cfg += md["data_type"] - return cfg - - def convert_dts_all(mds): - dts = "" - md_args = "" - for md in mds: - md_arg = md["arg"][0] - if md_args.find(md_arg) == -1: - md_dt = md["data_type"] - dts += f" --{md_arg}dt={md_dt}" - md_args += md_arg - return dts - def convert_dts_prelu(mds): - data_md = [md for md in mds if "data" in md["arg"]][0] - weights_md = [md for md in mds if "wei" in md["arg"]][0] - - data_dt = data_md["data_type"] - weights_dt = weights_md["data_type"] - - return f" --sdt={data_dt}:{weights_dt}" - - # --dt=SRC_DT[:WEI_DT][:DST_DT] - def convert_dts_multiple(mds): - dts = "--dt=" - has_fused_dw = 0 - for md in mds: - md_dt = md["data_type"] - md_arg = md["arg"] - if md_arg == "src_fused": - has_fused_dw = 1 - # Fused dw defines dst_dt by src_fused argument - # Note: assumes the order in mds is 'src_fused', then 'dst'. 
- if has_fused_dw == 1 and md_arg == "dst": - continue +def maybe_make_any_tag(md: ir.MemoryDescriptor): + return "any" if "a" in md.properties else md.tag - if md_arg == "src": - dts += f"{md_dt}" - elif md_arg == "wei": - dts += f":{md_dt}" - elif md_arg == "dst" or md_arg == "src_fused": - dts += f":{md_dt}" - else: - dts += f"" - return dts - def convert_dts_multiple_src(mds): - src_dts = "" - dts = "" - first_src = True - for md in mds: - md_dt = md["data_type"] - md_arg = md["arg"] - if md_arg == "src": - if not first_src: - src_dts += f":{md_dt}" - else: - src_dts += f" --{md_arg[0]}dt={md_dt}" - first_src = False - else: - if md_dt != "undef": - dts += f" --{md_arg[0]}dt={md_dt}" - return src_dts + dts - - def convert_dts_with_bias(mds): - dt = convert_dts_multiple(mds) - mds_bias = [md for md in mds if "bia" in md["arg"]] - if len(mds_bias) != 0: - md_bias = mds_bias[0] - bias_dt = md_bias["data_type"] - dt += " " + f"--bia_dt={bias_dt}" - return dt - - def convert_dts_with_ss(mds): - dt = convert_dts_multiple(mds) - mds_scale = [md for md in mds if "scale" in md["arg"]] - mds_shift = [md for md in mds if "shift" in md["arg"]] - - if len(mds_scale) != 0: - md_scale = mds_scale[0] - scale_dt = md_scale["data_type"] - dt += " " + f"--ss_dt={scale_dt}" - elif len(mds_shift) != 0: - md_shift = mds_shift[0] - shift_dt = md_shift["data_type"] - dt += " " + f"--ss_dt={shift_dt}" - - return dt - - convert_dts = { - "batch_normalization": convert_dts_common, - "binary": convert_dts_multiple_src, - "brgemm": convert_dts_multiple, - "concat": convert_dts_all, - "convolution": convert_dts_multiple, - "deconvolution": convert_dts_multiple, - "eltwise": convert_dts_common, - "inner_product": convert_dts_multiple, - "group_normalization": convert_dts_multiple, - "layer_normalization": convert_dts_with_ss, - "lrn": convert_dts_common, - "matmul": convert_dts_with_bias, - "pooling": convert_dts_multiple, - "prelu": convert_dts_prelu, - "reduction": convert_dts_all, - "reorder": convert_dts_all, - "resampling": convert_dts_all, - "rnn": convert_dts_cfg_rnn, - "shuffle": convert_dts_common, - "softmax": convert_dts_all, - "sum": convert_dts_multiple_src, - } +def attribute_flag(name: str): + def wrapper(converter: "Converter"): + attr = getattr(converter.entry.exts, name) + flag_name = name.replace("_", "-") + if attr is None: + return "" + return f"--attr-{flag_name}={attr!s}" + + return property(wrapper) + + +class ConverterMeta(type): + driver: str + + +class Converter(metaclass=ConverterMeta): + def __init__(self, entry: ir.Entry): + self.entry = entry + + def _get_dir(self): + dirs = { + "forward_training": "FWD_D", + "forward_inference": "FWD_I", + "backward_data": "BWD_D", + "backward_weights": "BWD_W", + "backward": "BWD_DW", + } + + if self.entry.prop_kind not in dirs: + return "" + + return dirs[self.entry.prop_kind] + + def _get_alg(self): + return self.entry.aux.get("alg") + + @staticmethod + def _get_policies(): + return "common", "per_oc" + + @staticmethod + def _get_policy_map(): + return 0, 1, 1, 1 + + def policy(self, mask: int): + policies = self._get_policies() + policy_map = self._get_policy_map() - convert = convert_dts.get(prim_kind) - if convert != None: - return convert(mds) - # FIXME: Error handling. 
Throw an error if get() is used, but None returned - return "" - - -def convert_tags(mds, prim_kind): - def convert_tags_common(mds): - tags = [md["tag"] for md in mds if md["tag"] != ""] - tag = tags[0] - return f"--tag={tag}" if tag else "" - - def convert_tags_all(mds): - tags = "" - has_fused_dw = 0 - for md in mds: - md_arg = md["arg"] - md_arg_abbr = md["arg"][0] - if md_arg == "src_fused": - has_fused_dw = 1 - md_arg_abbr = "d" - - # Fused dw defines dst_dt by src_fused argument - # Note: assumes the order in mds is 'src_fused', then 'dst'. - if has_fused_dw == 1 and md_arg == "dst": + if mask >= len(policy_map) or policy_map[mask] >= len(policies): + return "per_tensor" + return policies[policy_map[mask]] + + @property + def engine(self): + return f"--engine={self.entry.engine}" + + @property + def dir(self): + if self._get_dir(): + return f"--dir={self._get_dir()}" + return "" + + @property + def bias_mask(self): + return "" + + @property + def dts(self): + for md in self.entry.mds: + if md.data_type == "undef": continue - # skip bias and dw_fused weights - if md_arg_abbr == "b" or md_arg == "wei_fused": + return f"--dt={md.data_type}" + return "" + + @property + def tags(self): + for md in self.entry.mds: + if not md.tag: continue + return f"--tag={md.tag}" # XXX: Don't use maybe_make_any_tag + return "" + + @property + def flags(self): + return "" + + def _get_nondefault_args(self, values, defaults): + parts: List[str] = [] + pairs = list(zip(values, defaults)) + seen_nondefault = False + for value, default in reversed(pairs): + if value != default: + seen_nondefault = True + if seen_nondefault: + parts.append(str(value)) + return list(reversed(parts)) + + def _convert_dw_post_op(self, po: ir.DepthwisePostOp): + return f"dw:{po.ksp}:{po.dst_dt}" + + def _convert_sum_post_op(self, po: ir.SumPostOp): + values = po.scale, po.zp, po.dt + args = self._get_nondefault_args(values, defaults=(1.0, 0, "")) + return ":".join(["sum"] + args) + + def _convert_prelu_post_op(self, po: ir.PreLUPostOp): + if po.mask != 0: + return f"prelu:{self.policy(po.mask)}" + return "prelu" + + def _convert_eltwise_post_op(self, po: ir.EltwisePostOp): + values = po.alpha, po.beta, po.scale + args = self._get_nondefault_args(values, defaults=(0.0, 0.0, 1.0)) + return ":".join([po.alg] + args) + + def _convert_binary_post_op(self, po: ir.BinaryPostOp): + if po.tag != "any": + return f"{po.alg}:{po.dt}:{po.mask}:{po.tag}" + return f"{po.alg}:{po.dt}:{po.mask}" + + @property + def post_ops(self): + post_ops = self.entry.exts.post_ops + if post_ops is None: + return "" + results = [] + for post_op in post_ops: + if post_op.alg == "dw": + dw_po = cast(ir.DepthwisePostOp, post_op) + results.append(self._convert_dw_post_op(dw_po)) + elif post_op.alg == "sum": + sum_po = cast(ir.SumPostOp, post_op) + results.append(self._convert_sum_post_op(sum_po)) + elif post_op.alg == "prelu": + prelu_po = cast(ir.PreLUPostOp, post_op) + results.append(self._convert_prelu_post_op(prelu_po)) + elif post_op.alg.startswith("binary_"): + binary_po = cast(ir.BinaryPostOp, post_op) + results.append(self._convert_binary_post_op(binary_po)) + elif post_op.alg.startswith("eltwise_"): + eltwise_po = cast(ir.EltwisePostOp, post_op) + results.append(self._convert_eltwise_post_op(eltwise_po)) + return "--attr-post-ops=" + "+".join(results) + + def _get_quantization( + self, + params: Optional[Mapping[str, ir.QuantizationParam]], + def_value: float, + def_type: str, + ): + if params is None: + return "" + results = [] + for arg, param in 
params.items(): + policy = self.policy(param.mask) + result = f"{arg}:{policy}" + if policy == "common": + result += f":{def_value}" + dt = param.data_type + groups = param.groups + if dt != def_type or groups: + result += f":{dt}" + if groups: + result += f":{groups}" + results.append(result) + return "+".join(results) + + @property + def scales(self): + params = self._get_quantization(self.entry.exts.scales, 0.5, "f32") + return f"--attr-scales={params}" + + @property + def zero_points(self): + params = self._get_quantization(self.entry.exts.zero_points, 1, "s32") + return f"--attr-zero-points={params}" + + @property + def rounding_mode(self): + rounding_modes = self.entry.exts.rounding_mode + if rounding_modes is None: + return "" + results = [] + for arg, mode in rounding_modes.items(): + results.append(f"{arg}:{mode!s}") + return "--attr-rounding-mode=" + "+".join(results) + + scratchpad_mode = attribute_flag("scratchpad") + fpmath_mode = attribute_flag("fpmath") + acc_mode = attribute_flag("acc_mode") + + @property + def dropout(self): + dropout = self.entry.exts.dropout + if dropout is None: + return "" + # Use default p=0.5 and seed=12345 since those values are user data and + # can't be obtained properly. + result = "0.5:12345" + if dropout.tag: + result += f":{dropout.tag}" + return f"--attr-dropout={result}" + + deterministic = attribute_flag("deterministic") + + @property + def attrs(self): + attrs = ( + self.post_ops, + self.scales, + self.zero_points, + self.scratchpad_mode, + self.fpmath_mode, + self.acc_mode, + self.rounding_mode, + self.dropout, + self.deterministic, + ) + return " ".join(attr for attr in attrs if attr) + + @property + def aux(self): + alg = self._get_alg() + if alg is not None: + return f"--alg={alg}" + return "" - if "a" in md["properties"]: - tags += f" --{md_arg_abbr}tag=any" + @property + def shapes(self): + return self.entry.shapes + + +class AlgorithmMixin: + entry: ir.Entry + + def _get_alg(self): + alg = self.entry.aux.get("alg") + if alg is None: + return None + return alg.split(self.entry.prim_kind, 1)[1][1:] + + +class MultiSourceMixin: + entry: ir.Entry + + @property + def dts(self): + src_dts: List[str] = [] + other_dts: Dict[str, str] = {} + for md in self.entry.mds: + dt = md.data_type + if md.arg == "src": + src_dts.append(dt) + elif dt != "undef": + other_dts[md.arg[0]] = dt + sdt_flags = "--sdt=" + ":".join(src_dts) + other_dt_flags = " ".join(f"--{k}dt={v}" for k, v in other_dts.items()) + return f"{sdt_flags} {other_dt_flags}".strip() + + @property + def tags(self): + src_tags: List[str] = [] + other_tags: Dict[str, str] = {} + for md in self.entry.mds: + if md.arg == "src": + src_tags.append(maybe_make_any_tag(md)) + elif md.tag: + other_tags[md.arg[0]] = maybe_make_any_tag(md) + stag_flags = "--stag=" + ":".join(src_tags) + other_tag_flags = " ".join( + f"--{k}tag={v}" for k, v in other_tags.items() + ) + return f"{stag_flags} {other_tag_flags}".strip() + + +class CommonDataTypeMixin: + entry: ir.Entry + + @property + def dts(self): + dts: Dict[str, str] = {} + for md in self.entry.mds: + c = md.arg[0] + if c in dts: + continue + dts[c] = md.data_type + return " ".join(f"--{k}dt={v}" for k, v in dts.items()) + + +class TagTripletMixin: + entry: ir.Entry + + @property + def tags(self): + md_map = {md.arg: md for md in self.entry.mds} + has_fused_dw = "src_fused" in md_map + # Fused dw defines dst tag by src_fused argument + dst_name = "src_fused" if has_fused_dw else "dst" + tags = [] + if "src" in md_map: + md = md_map["src"] + tag = 
maybe_make_any_tag(md) + tags.append(f"--stag={tag}") + if "wei" in md_map: + md = md_map["wei"] + tag = maybe_make_any_tag(md) # pass wtag any for cases with compensation - elif md_arg_abbr == "w" and md["flags"]["value"] != "f0": - tags += f" --{md_arg_abbr}tag=any" + if str(md.flags.value) != "f0": + tag = "any" + tags.append(f"--wtag={tag}") + if dst_name in md_map: + md = md_map[dst_name] + tag = maybe_make_any_tag(md) + tags.append(f"--dtag={tag}") + return " ".join(tags) + + +class StridesMixin(TagTripletMixin): + @property + def tags(self): + tags = [] + strides = [] + + def add_strides_or_tag(arg, md): + tag = maybe_make_any_tag(md) + if arg == "wei" and str(md.flags.value) != "f0": + tag = "any" + if tag != "any" and tag.lower() == tag and md.strides: + strides.append(md.strides) else: - md_tag = md["tag"] - tags += f" --{md_arg_abbr}tag={md_tag}" - return tags - - def convert_tags_and_strides(mds): - tags = "" - strides = f" --strides=" - for md in mds: - md_arg = md["arg"][0] - # skip bias - if md_arg == "b": + tags.append(f"--{arg[0]}tag={tag}") + strides.append("") + + md_map = {md.arg: md for md in self.entry.mds} + args = "src", "wei", "dst" + for arg in args: + if arg not in md_map: continue + md = md_map[arg] + add_strides_or_tag(arg, md) + stride_flag = "--strides=" + ":".join(strides) + return " ".join(tags + [stride_flag]) + + +class MultiDataTypeMixin: + entry: ir.Entry + + @property + def dts(self): + dt_map = {md.arg: md.data_type for md in self.entry.mds} + # Fused dw defines dst_dt by src_fused argument + has_fused_dw = "src_fused" in dt_map + dst_name = "src_fused" if has_fused_dw else "dst" + dts = [ + dt_map.get("src", ""), + dt_map.get("wei", ""), + dt_map.get(dst_name, ""), + ] + return "--dt=" + ":".join(dt for dt in dts if dt) + + +class MultiDataTypeWithBiasMixin(MultiDataTypeMixin): + @property + def dts(self): + dts = super().dts + for md in self.entry.mds: + if md.arg != "bia": + continue + return f"{dts} --bia-dt={md.data_type}".strip() + return dts - if "a" in md["properties"]: - tags += f" --{md_arg}tag=any" - # pass wtag any for cases with compensation - elif md_arg == "w" and md["flags"]["value"] != "f0": - tags += f" --{md_arg}tag=any" - else: - md_strides = md["strides"] - - def tag_has_blocks(string): - for l in string: - if l.isupper(): - return True - return False - - md_tag_has_blocks = tag_has_blocks(md["tag"]) - if md_strides != "" and not md_tag_has_blocks: - strides += f"{md_strides}" - else: - md_tag = md["tag"] - tags += f" --{md_arg}tag={md_tag}" - if md_arg != "d": - strides += f":" - - tags += strides - return tags - # --tag=SRC_TAG[:WEI_TAG][:DST_TAG] - def convert_tags_multiple(mds): - tags = "--tag=" - for md in mds: - md_tag = md["tag"] - md_arg = md["arg"] - if md_arg == "src" or md_arg == "wei" or md_arg == "dst": - if md_arg != "src": - tags += f":" - if "a" in md["properties"]: - tags += f"any" - else: - tags += f"{md_tag}" - else: - tags += f"" - return tags - - def convert_tags_multiple_src(mds): - src_tags = "" - tags = "" - first_src = False - for md in mds: - md_tag = md["tag"] - md_arg = md["arg"] - if md_arg == "src": - if first_src: - if "a" in md["properties"]: - src_tags += f":any" - else: - src_tags += f":{md_tag}" - else: - if "a" in md["properties"]: - src_tags += f" --{md_arg[0]}tag=any" - else: - src_tags += f" --{md_arg[0]}tag={md_tag}" - first_src = True - else: - if md_tag != "": - if "a" in md["properties"]: - tags += f" --{md_arg[0]}tag=any" - else: - tags += f" --{md_arg[0]}tag={md_tag}" - return src_tags 
+ tags - - def convert_tags_prelu(mds): - # FIXME: fix benchdnn input template - data_md = [md for md in mds if "data" in md["arg"]][0] - weights_md = [md for md in mds if "wei" in md["arg"]][0] - - data_tag = data_md["tag"] - if "a" in data_md["properties"]: - data_tag = "any" - weights_tag = weights_md["tag"] - if "a" in weights_md["properties"]: - weights_tag = "any" - - return f" --stag={data_tag}:{weights_tag}" - - def convert_tags_rnn(mds): - tags = "--tag=" - with_proj = "" - with_peep = "" - skip_colon = True +class NormalizationMixin: + entry: ir.Entry - # Tags for backward are driven by diff tensors, query them instead of - # forward tensors. Latter will always have `any` format. - has_diff_tensors = False - for md in mds: - if md["arg"].find("diff") != -1: - has_diff_tensors = True + @property + def aux(self): + flags = self.entry.aux.get("flags") + if flags is not None: + return f"--flags={flags}" + return "" - for md in mds: - md_arg = md["arg"] - md_tag = md["tag"] - if has_diff_tensors == True: - if md_arg in ["diff_src_layer", "diff_wei_layer", "diff_dst_layer"]: - if not skip_colon: - tags += f":" - if "a" in md["properties"]: - tags += f"any" - else: - tags += f"{md_tag}" - skip_colon = False - else: - if md_arg in ["src_layer", "wei_layer", "dst_layer"]: - if not skip_colon: - tags += f":" - if "a" in md["properties"]: - tags += f"any" - else: - tags += f"{md_tag}" - skip_colon = False - - if md_arg == "wei_proj" and md_tag != "undef": - with_proj = " --with-projection=true" - if md_arg == "wei_peephole" and md_tag != "undef": - with_peep = " --with-peephole=true" - - return tags + with_proj + with_peep - - def convert_tags_lnorm(mds): - tag = convert_tags_multiple(mds) - stat_md = "" - for md in mds: - if md["arg"] == "stats": - stat_tag = md["tag"] - - return f"{tag} --stat_tag={stat_tag}" - - cvt_tags = { - "batch_normalization": convert_tags_common, - "binary": convert_tags_multiple_src, - "concat": convert_tags_multiple_src, - "convolution": convert_tags_all, - "deconvolution": convert_tags_all, - "eltwise": convert_tags_common, - "inner_product": convert_tags_all, - "group_normalization": convert_tags_multiple, - "layer_normalization": convert_tags_lnorm, - "lrn": convert_tags_common, - "matmul": convert_tags_and_strides, - "pooling": convert_tags_common, - "prelu": convert_tags_prelu, - "reduction": convert_tags_all, - "reorder": convert_tags_and_strides, - "resampling": convert_tags_common, - "rnn": convert_tags_rnn, - "shuffle": convert_tags_common, - "softmax": convert_tags_all, - "sum": convert_tags_multiple_src, - } - convert = cvt_tags.get(prim_kind) - if convert: - return convert(mds) - return "" - - -def convert_flags(mds, prim_kind): - def convert_flags_reorder(mds): - def convert_flag(prefix, md): - flag = "" - flag_fields = md.get("flags") - if flag_fields != None: - cvt = {"s8_comp_mask": "s8s8_comp", "zp_comp_mask": "zp_comp"} - for f in cvt.keys(): - value = flag_fields.get(f) - if value != None: - benchdnn_flag = cvt[f] + ":" + value - if flag == "": - flag = benchdnn_flag - else: - flag += "+" + benchdnn_flag - if flag != "": - return f"--{prefix}flag={flag}" - else: - return "" +class BatchNormalizationConverter(NormalizationMixin, Converter): + driver: str = "bnorm" - flags = "" - # FIXME: fix benchdnn input template - input_md = [md for md in mds if "src" in md["arg"]][0] - output_md = [md for md in mds if "dst" in md["arg"]][0] - iflag = convert_flag("i", input_md) - oflag = convert_flag("o", output_md) +class BinaryConverter(AlgorithmMixin, 
MultiSourceMixin, Converter): + driver: str = "binary" - if iflag != "": - flags += iflag - if oflag != "": - flags += " " + oflag - return flags + @property + def shapes(self): + return self.entry.shapes.split(" ", 1)[0] - def convert_flags_rnn(mds): - for md in mds: - md_arg = md["arg"] - if md_arg == "src_iter" or md_arg == "src_layer": - md_strides = md["strides"] - if md_strides != "": - return f"--trivial-strides=false" - return f"--trivial-strides=true" +class BRGEMMConverter(MultiDataTypeMixin, Converter): + driver: str = "brgemm" - cvt_flags = { - "rnn": convert_flags_rnn, - "reorder": convert_flags_reorder, - } + @property + def aux(self): + bs = self.entry.aux.get("bs", "") + beta = self.entry.aux.get("beta", "") + return f"--bs={bs} --beta={beta}" + + +class ConcatConverter(CommonDataTypeMixin, MultiSourceMixin, Converter): + driver: str = "concat" - convert = cvt_flags.get(prim_kind) - if convert: - return convert(mds) - return "" + @property + def aux(self): + axis = self.entry.aux.get("axis") + if axis is None: + return "" + return f"--axis={axis}" -def extract_attr(attrs, type): - start_idx = attrs.find(type) - if start_idx == -1: +class ConvolutionConverter( + AlgorithmMixin, + TagTripletMixin, + MultiDataTypeWithBiasMixin, + Converter, +): + driver: str = "conv" + + @property + def aux(self): + alg = self._get_alg() + if alg is not None: + return f"--alg={alg}" return "" - start_idx += len(type) + 1 - end_symbol = ";" - if type == "post_ops": - start_idx += 1 - end_symbol = "'" - end_idx = attrs.find(end_symbol, start_idx) - if type == "post_ops": - start_idx -= 1 - end_idx += 1 - return attrs[start_idx:end_idx] - - -def convert_scale_policy(value, prim_kind): - if prim_kind == "reorder": - masks = {0: "common", 1: "per_dim_0", 2: "per_dim_1", 3: "per_dim_01"} - elif prim_kind == "matmul": - masks = { - 0: "common", - 1: "per_oc", - 2: "per_oc", - 3: "per_ocic", - 4: "per_oc", - 6: "per_ocic", - 8: "per_oc", - 12: "per_ocic", - } - else: - masks = {0: "common", 1: "per_oc", 2: "per_oc", 3: "per_oc"} - - mask = masks.get(int(value)) - if mask: - return mask - # this is a workaround for tensors with mask more than 4 - return "per_tensor" - - -def convert_zp_policy(value, prim_kind): - if prim_kind == "matmul": - masks = { - 0: "common", - 2: "per_oc", - 3: "per_ocic", - 4: "per_oc", - 6: "per_ocic", - 12: "per_ocic", - } - else: - masks = {0: "common", 2: "per_dim_1"} - mask = masks.get(int(value)) - if mask: - return mask - # this is a workaround for tensors with mask more than 4 - return "per_tensor" - - -def convert_post_ops(post_ops, prim_kind): - def convert_binary_post_op(post_op): - po = post_op["alg"] + ":" + post_op["dt"] + ":" + post_op["mask"] - if post_op["tag"] != None: - po += ":" + post_op["tag"] - return po - - def convert_dw_post_op(post_op): - po = post_op["alg"] + ":" + post_op["ksp"] + ":" + post_op["dst_dt"] - return po - - def convert_eltwise_post_op(post_op): - benchdnn_p_op = post_op["alg"] - alpha = post_op["alpha"] - beta = post_op["beta"] - scale = post_op["scale"] - if alpha != "1.0": - benchdnn_p_op += ":" + alpha - if beta != "0.0": - benchdnn_p_op += ":" + beta - if alpha != "1.0": - benchdnn_p_op += ":" + scale - return benchdnn_p_op - - def convert_sum_post_op(post_op): - benchdnn_p_op = post_op["alg"] - if post_op["scale"] != 1.0: - benchdnn_p_op += ":" + post_op["scale"] - if post_op["zp"] != 0: - benchdnn_p_op += ":" + post_op["zp"] - if post_op["dt"] != "": - benchdnn_p_op += ":" + post_op["dt"] - return benchdnn_p_op - - def 
convert_prelu_post_op(post_op): - benchdnn_p_op = post_op["alg"] - if post_op["mask"] != 0: - policy = convert_scale_policy(post_op["mask"], prim_kind) - benchdnn_p_op += ":" + policy - return benchdnn_p_op - - convert = { - "binary": convert_binary_post_op, - "dw": convert_dw_post_op, - "eltwise": convert_eltwise_post_op, - "sum": convert_sum_post_op, - "prelu": convert_prelu_post_op, - } - benchdnn_postops = "" - for e in post_ops: - for k in convert.keys(): - if k in e["alg"]: - cvt = convert.get(k) - if benchdnn_postops != "": - benchdnn_postops += "+" - benchdnn_postops += cvt(e) - break - return benchdnn_postops +class DeconvolutionConverter(ConvolutionConverter): + driver: str = "deconv" -def convert_quantization(q_param, prim_kind, def_value, def_type): - res = [] - for arg in q_param.keys(): - p = q_param[arg] - policy = convert_scale_policy(p["mask"], prim_kind) - benchdnn_p = arg + ":" + policy - if policy == "common": - benchdnn_p += ":" + def_value - dt = p["data_type"] - groups = p["groups"] - if dt != def_type or groups != "": - benchdnn_p += ":" + dt - if groups != "": - benchdnn_p += ":" + groups - res.append(benchdnn_p) - return "+".join(res) +class EltwiseConverter(Converter): + driver: str = "eltwise" + @property + def aux(self): + alpha = self.entry.aux.get("alpha") + beta = self.entry.aux.get("beta") + flags = [f"--alg={self._get_alg()}"] + if alpha is not None: + flags.append(f"--alpha={alpha}") + if beta is not None: + flags.append(f"--beta={beta}") + return " ".join(flags) -def convert_scales(scales, prim_kind): - return convert_quantization( - q_param=scales, prim_kind=prim_kind, def_value="0.5", def_type="f32" - ) +class GroupNormalizationConverter( + MultiDataTypeMixin, + BatchNormalizationConverter, +): + driver: str = "gnorm" -def convert_zero_points(zero_points, prim_kind): - return convert_quantization( - q_param=zero_points, prim_kind=prim_kind, def_value="1", def_type="s32" - ) + # --tag=SRC_TAG[:WEI_TAG][:DST_TAG] + @property + def tags(self): + tag_map = {md.arg: maybe_make_any_tag(md) for md in self.entry.mds} + args = "src", "wei", "dst" + tags = [tag_map[arg] for arg in args if arg in tag_map] + return "--tag=" + ":".join(tags) + + +class InnerProductConverter( + TagTripletMixin, MultiDataTypeWithBiasMixin, Converter +): + driver: str = "ip" + + +class LayerNormalizationConverter(GroupNormalizationConverter): + driver: str = "lnorm" + + @property + def dts(self): + dts = super().dts + shift_flag = None + for md in self.entry.mds: + if "scale" in md.arg: + return f"{dts} --ss_dt={md.data_type}".strip() + if "shift" in md.arg and shift_flag is None: + shift_flag = f"--ss_dt={md.data_type}" + if shift_flag is not None: + return f"{dts} {shift_flag}".strip() + return dts -def convert_rounding_mode(rounding_modes, prim_kind): - res = [] - for arg in rounding_modes.keys(): - res.append(arg + ":" + rounding_modes[arg]) - return "+".join(res) + @property + def tags(self): + tags = super().tags + for md in self.entry.mds: + if md.arg == "stats": + tags = f"{tags} --stat_tag={maybe_make_any_tag(md)}" + return tags.strip() -def convert_scratchpad_mode(scratchpad_mode, prim_kind): - return scratchpad_mode +class LRNConverter(AlgorithmMixin, Converter): + driver: str = "lrn" -def convert_fpmath_mode(fpmath_mode, prim_kind): - return fpmath_mode + @property + def aux(self): + alg = self._get_alg() + algs = {"across_channels": "ACROSS", "within_channel": "WITHIN"} + if alg not in algs: + return "" + return f"--alg={algs[alg]}" -def convert_acc_mode(acc_mode, 
prim_kind): - return acc_mode +class MatmulConverter(StridesMixin, MultiDataTypeWithBiasMixin, Converter): + driver: str = "matmul" + @staticmethod + def _get_policies(): + return "common", "per_oc", "per_ocic" -def convert_dropout(dropout, prim_kind): - ret = dropout["p"] - if dropout["seed"] != None: - ret += ":" + dropout["seed"] - if dropout["tag"] != None: - ret += ":" + dropout["tag"] - return ret + @staticmethod + def _get_policy_map(): + return 0, 1, 1, 2, 1, 3, 2, 3, 1, 3, 3, 3, 2 + @property + def bias_mask(self): + for md in self.entry.mds: + if md.arg != "bia": + continue + if "_" in md.flags.value: + mask = md.flags.value.split("_")[1][4:] + return f"--bia_mask={mask}" + return "" -def convert_deterministic(deterministic, prim_kind): - return deterministic + @property + def aux(self): + rt_dim_masks = self.entry.aux.get("runtime_dims_masks", "") + return f"--runtime_dims_masks={rt_dim_masks}" -def convert_attrs(exts, prim_kind): - converters = { - "attr-post-ops": convert_post_ops, - "attr-scales": convert_scales, - "attr-zero-points": convert_zero_points, - "attr-scratchpad": convert_scratchpad_mode, - "attr-fpmath": convert_fpmath_mode, - "attr-acc": convert_acc_mode, - "attr-rounding-mode": convert_rounding_mode, - "attr-dropout": convert_dropout, - "attr-deterministic": convert_deterministic, - } +class PoolingConverter(MultiDataTypeMixin, Converter): + driver: str = "pool" + + @property + def aux(self): + return f"--alg={self._get_alg()}" + + +class PreLUConverter(Converter): + driver: str = "prelu" - benchdnn_attrs = "" - for e in converters.keys(): - attr = exts.get(e) - if attr != None: - if benchdnn_attrs != "": - benchdnn_attrs += " " - benchdnn_attrs += f"--{e}=" + converters[e](attr, prim_kind) - return benchdnn_attrs + @property + def dts(self): + data_dt, wei_dt = "", "" + for md in self.entry.mds: + if "data" in md.arg and not data_dt: + data_dt = md.data_type + if "wei" in md.arg and not wei_dt: + wei_dt = md.data_type + if data_dt and wei_dt: + break + return f"--sdt={data_dt}:{wei_dt}" + + @property + def tags(self): + data_tag, wei_tag = "", "" + for md in self.entry.mds: + if "data" in md.arg and not data_tag: + data_tag = maybe_make_any_tag(md) + if "wei" in md.arg and not wei_tag: + wei_tag = maybe_make_any_tag(md) + if data_tag and wei_tag: + break + return f"--stag={data_tag}:{wei_tag}" + + +class ReductionConverter( + AlgorithmMixin, + TagTripletMixin, + CommonDataTypeMixin, + Converter, +): + driver: str = "reduction" + + @property + def aux(self): + p = self.entry.aux.get("p") + eps = self.entry.aux.get("eps") + args = [f"--alg={self._get_alg()}"] + if p is not None: + args.append(f"--p={p}") + if eps is not None: + args.append(f"--eps={eps}") + return " ".join(args) + + +class ReorderConverter(StridesMixin, CommonDataTypeMixin, Converter): + driver: str = "reorder" + + def _convert_flag(self, prefix, md: ir.MemoryDescriptor): + flags = [] + fields = md.flags + if fields.s8_comp_mask is not None: + flags.append(f"s8s8_comp:{fields.s8_comp_mask}") + if fields.zp_comp_mask is not None: + flags.append(f"zp_comp:{fields.zp_comp_mask}") + if flags: + return f"--{prefix}flag=" + "+".join(flags) + return "" + + @staticmethod + def _get_policies(): + return "common", "per_dim_0", "per_dim_1", "per_dim_01" + @staticmethod + def _get_policy_map(): + return 0, 1, 2, 3 -def convert_shapes(shapes, prim_kind): - if prim_kind == "binary": - shapes = shapes.split(" ")[0] - return f"{shapes}" + @property + def flags(self): + flags = {} + for md in self.entry.mds: + 
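# Reorder has a single input and output, so only the first src-like and
+            # dst-like descriptors are converted; the loop exits once both are set.
+            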
if "src" in md.arg and "src" not in flags: + flags["src"] = self._convert_flag("i", md) + elif "dst" in md.arg and "dst" not in flags: + flags["dst"] = self._convert_flag("o", md) + + if "src" in flags and "dst" in flags: + break + iflag = flags.get("src", "") + oflag = flags.get("dst", "") + return f"{iflag} {oflag}".strip() + + @property + def aux(self): + mask = self.entry.aux.get("runtime-dim-mask") + if mask: + return f"--runtime-dim-mask={mask}" + return "" + + +class ResamplingConverter(AlgorithmMixin, CommonDataTypeMixin, Converter): + driver: str = "resampling" + + +class RNNConverter(AlgorithmMixin, Converter): + driver: str = "rnn" + + @property + def flags(self): + for md in self.entry.mds: + if md.arg not in ("src_iter", "src_layer"): + continue + if md.strides == "": + continue + return "--trivial-strides=false" + return "--trivial-strides=true" + + def _get_flag_from(self, flag_name, flag_values): + flag = self.entry.aux.get(flag_name) + if flag is None or flag not in flag_values: + return "" + return f"--{flag_name}={flag_values[flag]}" + + @property + def aux(self): + algs = { + "vanilla_rnn": "VANILLA_RNN", + "vanilla_lstm": "VANILLA_LSTM", + "vanilla_gru": "VANILLA_GRU", + "vanilla_augru": "VANILLA_AUGRU", + "lbr_gru": "LBR_GRU", + "lbr_augru": "LBR_AUGRU", + } + dirs = { + "unidirectional_left2right": "left2right", + "unidirectional_right2left": "right2left", + "bidirectional_sum": "sum", + "bidirectional_concat": "concat", + } + acts = { + "eltwise_relu": "RELU", + "eltwise_logistic": "LOGISTIC", + "eltwise_tanh": "TANH", + } + all_flags = [ + self._get_flag_from("alg", algs), + self._get_flag_from("direction", dirs), + self._get_flag_from("activation", acts), + ] + flags = self.entry.aux.get("flags") + if flags is not None: + all_flags.append(f"--flags={flags}") + return " ".join(flag for flag in all_flags if flag) + + @property + def dir(self): + dir = self._get_dir() + return f"--prop={dir}" + + @property + def dts(self): + args = ["src_iter", "src_iter_c", "src_layer", "dst_iter", "dst_layer"] + cfg_dts: str + common_dt = True + shared_dt = None + bias_dt = None + md_map: Dict[Optional[str], ir.MemoryDescriptor] = {} + for md in self.entry.mds: + md_map[md.arg] = md + if md.arg == "bias": + bias_dt = md.data_type + elif md.arg in args: + if shared_dt is None: + shared_dt = md.data_type + elif md.data_type != shared_dt: + common_dt = False + if common_dt and shared_dt in ["f32", "f16"]: + cfg_dts = shared_dt + elif common_dt and shared_dt == "bf16": + cfg_dts = shared_dt + # bias is part of cfg for bf16 + if bias_dt is not None and bias_dt != shared_dt: + cfg_dts += bias_dt + else: + cfg_dts = "" + for arg in args: + if arg not in md_map: + continue + md = md_map[arg] + # src iter is skipped if it is f16 + if arg == "src_iter_c" and md.data_type == "f16": + continue + cfg_dts += md.data_type + return f"--cfg={cfg_dts}" + + @property + def tags(self): + # Tags for backward are driven by diff tensors, query them instead of + # forward tensors. Latter will always have `any` format. 
+ has_diff_tensors = False + for md in self.entry.mds: + if "diff" in md.arg: + has_diff_tensors = True + break + + layer_names = ["src_layer", "wei_layer", "dst_layer"] + if has_diff_tensors: + layer_names = [f"diff_{name}" for name in layer_names] + tags = [] + other_flags = [] + for md in self.entry.mds: + arg = md.arg + tag = maybe_make_any_tag(md) + if arg in layer_names: + tags.append(tag) + elif md.tag == "undef": + continue + elif arg == "wei_proj": + other_flags.append("--with-projection=true") + elif arg == "wei_peephole": + other_flags.append("--with-peephole=true") + tag_flag = "--tag=" + ":".join(tags) + return " ".join([tag_flag] + other_flags) + + +class ShuffleConverter(Converter): + driver: str = "shuffle" + + @property + def aux(self): + axis = self.entry.aux.get("axis") + group = self.entry.aux.get("group") + args = [] + if axis is not None: + args.append(f"--axis={axis}") + if group is not None: + args.append(f"--group={group}") + return " ".join(args) + + +class SoftmaxConverter(TagTripletMixin, CommonDataTypeMixin, Converter): + driver: str = "softmax" + + @property + def aux(self): + axis = self.entry.aux.get("axis") + flags = f"--alg={self._get_alg()}" + if axis is not None: + flags += f" --axis={axis}" + return flags + + +class SumConverter(MultiSourceMixin, Converter): + driver: str = "sum" + + +class ZeroPadConverter(Converter): + driver: str = "zeropad" + + @property + def dts(self): + return f"--dt={self.entry.mds[0].data_type}" + + @property + def tags(self): + return f"--tag={maybe_make_any_tag(self.entry.mds[0])}" + + +def get_converter(primitive: str) -> ConverterMeta: + converters: Dict[str, ConverterMeta] = { + "batch_normalization": BatchNormalizationConverter, + "binary": BinaryConverter, + "brgemm": BRGEMMConverter, + "concat": ConcatConverter, + "convolution": ConvolutionConverter, + "deconvolution": DeconvolutionConverter, + "eltwise": EltwiseConverter, + "group_normalization": GroupNormalizationConverter, + "inner_product": InnerProductConverter, + "layer_normalization": LayerNormalizationConverter, + "lrn": LRNConverter, + "matmul": MatmulConverter, + "pooling": PoolingConverter, + "prelu": PreLUConverter, + "reduction": ReductionConverter, + "reorder": ReorderConverter, + "resampling": ResamplingConverter, + "rnn": RNNConverter, + "shuffle": ShuffleConverter, + "softmax": SoftmaxConverter, + "sum": SumConverter, + "zero_pad": ZeroPadConverter, + } + return converters[primitive] class InputGenerator: @@ -846,49 +863,39 @@ class InputGenerator: Generates an input for benchdnn from internal representation. 
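
+    A minimal illustrative use (assuming `entries` is the dict of parsed
+    verbose entries produced by LogParser):
+
+        generator = InputGenerator()
+        batches = generator.generate(entries, split_by_driver=True)
+        for driver, cases in batches.items():
+            print(f"--{driver}", cases)
+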
""" - def __init__(self, writer): - self.__writer = writer + def __init__(self, logger: Optional[logging.Logger] = None): + self.logger = logger + + def _generate_case(self, entry: ir.Entry): + Converter = get_converter(entry.prim_kind) + converter = Converter(entry) + args = [ + "--reset", + "--allow-enum-tags-only=0", + converter.engine, + converter.dir, + converter.aux, + converter.bias_mask, + converter.dts, + converter.tags, + converter.flags, + converter.attrs, + converter.shapes, + ] + return converter.driver, " ".join(arg for arg in args if arg) def generate(self, input, split_by_driver=False): - data = {} - - def generate_case(entry, add_driver=True): - case = "" - if add_driver: - case += "--" + convert_driver(entry["prim_kind"]) - # reset everything, because benchdnn is a state machine and options - # affect all following test cases - case += " --reset" - # allow extended set of tags - case += " --allow-enum-tags-only=0" - - case += " " + convert_engine(entry["engine"]) - # XXX: direction depends on mds (FWD_B is forward + defined bias md) - case += " " + convert_dir(entry) - case += " " + convert_aux(entry) - if entry["prim_kind"] == "matmul": - case += " " + convert_bias_mask(entry["mds"]) - # XXX: data types configuration is not unified across drivers - case += " " + convert_dts(entry["mds"], entry["prim_kind"]) - case += " " + convert_tags(entry["mds"], entry["prim_kind"]) - case += " " + convert_flags(entry["mds"], entry["prim_kind"]) - case += " " + convert_attrs(entry["exts"], entry["prim_kind"]) - case += " " + convert_shapes(entry["shapes"], entry["prim_kind"]) - return case - - if split_by_driver: - for key, value in input.items(): - case = generate_case(value, False) + "\n" - driver_cases = data.get(convert_driver(value["prim_kind"])) - if driver_cases: - data[convert_driver(value["prim_kind"])] += case - else: - data[convert_driver(value["prim_kind"])] = case - else: - for key, value in input.items(): - case = generate_case(value, True) + "\n" - if data.get("all"): - data["all"] += case - else: - data["all"] = case - return data + missing: Set[str] = set() + data: Dict[str, List[str]] = defaultdict(list) + for value in input.values(): + try: + driver, args = self._generate_case(value) + except KeyError as e: + if self.logger is not None and str(e) not in missing: + missing.add(str(e)) + self.logger.warning(f"Missing converter: {e!s}") + continue + if not split_by_driver: + driver, args = "all", f"--{driver} {args}" + data[driver].append(args) + return {k: "\n".join(v) for k, v in data.items()} diff --git a/scripts/verbose_converter/src/breakdown_generator.py b/scripts/verbose_converter/src/breakdown_generator.py index 23ed3ada8d7..772f4c0dcf1 100644 --- a/scripts/verbose_converter/src/breakdown_generator.py +++ b/scripts/verbose_converter/src/breakdown_generator.py @@ -1,5 +1,5 @@ ################################################################################ -# Copyright 2022-2023 Intel Corporation +# Copyright 2022-2024 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,35 +14,44 @@ # limitations under the License. ################################################################################ +from collections import defaultdict +from typing import Any, Dict, List + +from . 
import ir
+
+
+class Aggregate:
+    def __init__(self):
+        self.occurrences = 0
+        self.time = 0.0
+
+    def add(self, occurrence: float):
+        self.occurrences += 1
+        self.time += occurrence
+
+    def __iter__(self):
+        yield self.occurrences
+        yield self.time
+
 
 class BreakdownGenerator:
     """
     Generates an execution statistics breakdown from the internal representation.
     """
 
-    def __init__(self, writer):
-        self.__writer = writer
+    def __init__(self, _: Any = None):  # Maintain old interface
+        pass
 
-    def generate(self, input, agg_keys):
-        data = {}
-        output = {}
+    def generate(self, input: Dict[int, ir.Entry], agg_keys: List[str]):
+        data: Dict[str, Aggregate] = defaultdict(Aggregate)
         ofs = ","
 
+        if not input:
+            return {}
+
         def key2str(key, value):
             def mds2str(mds):
-                md_fields = [
-                    "arg",
-                    "data_type",
-                    "properties",
-                    "format_kind",
-                    "tag",
-                    "strides",
-                ]
-                ffs = ":"
-                mdfs = " "
-                return mdfs.join(
-                    [ffs.join([arg[field] for field in md_fields]) for arg in mds]
-                )
+                return " ".join(map(str, mds))
 
             def aux2str(aux):
                 auxfs = " "
@@ -56,66 +65,62 @@ def aux2str(aux):
             return str(value)
 
         # Gather occurrences and aggregate time statistics
-        total_time = 0
-        for key, value in input.items():
-            item_key = ofs.join([key2str(k, value[k]) for k in agg_keys])
-            occ, time = data.get(item_key, (0, 0.0))
-            data[item_key] = (occ + 1, time + float(value["time"]))
-            total_time += float(value["time"])
+        total_time: float = 0
+        for value in input.values():
+            item_key = ofs.join(key2str(k, getattr(value, k)) for k in agg_keys)
+            data[item_key].add(value.time)
+            total_time += value.time
 
         # sort keys by decreasing total time
-        sorted_item_keys = sorted(
-            data, key=lambda t: data.__getitem__(t)[1], reverse=True
-        )
-
-        cum_entry = 0
-        cum_time = 0
-        avg_call = 0
-        sorted_avg_call = {}
-        sorted_cum_time = {}
-        for key in sorted_item_keys:
-            cum_entry = cum_entry + 1
-            cum_time = cum_time + data[key][1]
-            avg_call = avg_call + (data[key][0] - avg_call) / cum_entry
+        sorted_keys = sorted(data, key=lambda t: data[t].time, reverse=True)
+
+        cum_entry: int = 0
+        cum_time: float = 0
+        avg_call: float = 0
+        sorted_avg_call: Dict[str, float] = {}
+        sorted_cum_time: Dict[str, float] = {}
+        for key in sorted_keys:
+            item = data[key]
+            cum_entry += 1
+            cum_time = cum_time + item.time
+            avg_call = avg_call + (item.occurrences - avg_call) / cum_entry
             sorted_avg_call[key] = avg_call
             sorted_cum_time[key] = cum_time
 
-        output["all"] = (
-            ofs.join(
-                agg_keys
-                + [
-                    "ncalls",
-                    "time(ms)",
-                    "overall%",
-                    "agg_ncalls(avg)",
-                    "agg_time(ms)",
-                    "agg_overall%",
-                ]
-            )
-            + "\n"
-        )
+        fixed_keys = [
+            "ncalls",
+            "time(ms)",
+            "overall%",
+            "agg_ncalls(avg)",
+            "agg_time(ms)",
+            "agg_overall%",
+        ]
+
+        output = ofs.join(agg_keys + fixed_keys)
 
         def str_num(s):
-            return "{val:.2f}".format(val=s)
+            return f"{s:.2f}"
 
         def str_pct(s):
-            return "{val:.2f}".format(val=s * 100)
-
-        ors = "\n"
-        output["all"] += ors.join(
-            [
-                ofs.join(
-                    [
-                        str(item_key),
-                        str(data[item_key][0]),
-                        str_num(data[item_key][1]),
-                        str_pct(data[item_key][1] / total_time),
-                        str_num(sorted_avg_call[item_key]),
-                        str_num(sorted_cum_time[item_key]),
-                        str_pct(sorted_cum_time[item_key] / total_time),
-                    ]
-                )
-                for item_key in sorted_item_keys
+            return f"{s * 100:.2f}"
+
+        def safe_div(n, d):
+            # Assumption: 0 <= n <= d
+            # If the assumption is broken, we can still raise ZeroDivisionError
+            return 1 if n == d == 0 else n / d
+
+        for key in sorted_keys:
+            item = data[key]
+            avg_call = sorted_avg_call[key]
+            cum_time = sorted_cum_time[key]
+            fields = [
+                str(key),
+                str(item.occurrences),
+                
str_num(item.time), + str_pct(safe_div(item.time, total_time)), + str_num(avg_call), + str_num(cum_time), + str_pct(safe_div(cum_time, total_time)), ] - ) - return output + output += "\n" + ofs.join(fields) + return {"all": output} diff --git a/scripts/verbose_converter/src/dnnl_parser.py b/scripts/verbose_converter/src/dnnl_parser.py index 88bfcba3405..78d24da59bd 100644 --- a/scripts/verbose_converter/src/dnnl_parser.py +++ b/scripts/verbose_converter/src/dnnl_parser.py @@ -13,6 +13,22 @@ # See the License for the specific language governing permissions and # limitations under the License. ################################################################################ +from typing import Iterable, List, Tuple + +from . import ir, parse + + +class LoggingContext: + def __init__(self, logger): + self.logger = logger + + def __enter__(self): + return self + + def __exit__(self, type, value, _): + if type is not None and issubclass(type, parse.ParseError): + self.logger.warning(str(value)) + return True class LogParser: @@ -21,32 +37,10 @@ class LogParser: representation. """ - def __init__(self, writer, input=""): - # each data entry is a dictionary that consists of: - # engine(str), - # primitive(str), - # implementation(str), - # prop_kind(str), - # aux({field(str) : value(str)}), - # mds( - # { - # arg(str): { - # data_type(str), - # properties(str), - # format_kind(str), - # tag(str), - # strides(str), - # flags(str), - # } - # } - # ) - # shapes(str) - # extensions(str) - # time(float) - self.__raw_data = [] - self.__data = {} - self.__writer = writer - self.__input = input + def __init__(self, logger, input: Iterable[str] = ()): + self.input = input + self.error_handler = LoggingContext(logger) + self.data: List[Tuple[str, ir.Entry]] = [] def process(self, filter_events): """ @@ -61,436 +55,8 @@ def process(self, filter_events): None """ - def convert_primitive(log_entry, template, version): - """ - Converts oneDNN verbose primitive entry into the internal - representation. 
- """ - - def split_arg_dt(arg_dt): - def buffer(dt): - return {"data": dt, "offset": 0} - - def eof(buf): - return buf["offset"] >= len(buf["data"]) - - def get_data(buf): - if eof(buf): - return None - return buf["data"][buf["offset"] :] - - def read_int(buf): - data = get_data(buf) - if not data: - return None - if data[0] not in "123456789": - return None - for n, c in enumerate(data): - if c not in "0123456789": - buf["offset"] += n - return int(data[:n]) - buf["offset"] += len(data) - return int(data) - - def read_literal(buf, literal): - data = get_data(buf) - if not data: - return None - if not data.startswith(literal): - return None - buf["offset"] += len(literal) - return True - - def parse_int_type(dt): - buf = buffer(dt) - if not (read_literal(buf, "u") or read_literal(buf, "s")): - return False - if not read_int(buf): - return False - return eof(buf) - - def parse_float_type(dt): - buf = buffer(dt) - read_literal(buf, "b") # ignore b in bf16 - if not read_literal(buf, "f"): - return False - if not read_int(buf): - return False - if eof(buf): - return True # f16, f32, f64 - if not read_literal(buf, "_e"): - return False - if not read_int(buf): - return False - if not read_literal(buf, "m"): - return False - if not read_int(buf): - return False - return eof(buf) # f8_eXmY - - parts = arg_dt.split("_") - for split in range(1, len(parts)): - input_parts = parts[:split] - dt_parts = parts[split:] - dt = "_".join(dt_parts) - if dt == "undef": - return "_".join(input_parts), dt - if parse_int_type(dt) or parse_float_type(dt): - return "_".join(input_parts), dt - - def convert_mds(log_mds, version): - mds = [] - for md in log_mds.split(" "): - fields = md.split(":") - idx = 0 - - # if version >= 1: - # arg:dt:properties:format_kind:tag:strides:flags - ## ^ - # else: - # arg_dt:properties:format_kind:tag:strides:flags - # (note) Legacy way could have collisions with `arg` and - # `dt` since `_` used as a delimiter and as a part of the - # name. - arg = None - data_type = None - if int(version) >= 1: - arg = fields[idx] - idx += 1 - data_type = fields[idx] - idx += 1 - else: - arg_dt = fields[idx] - idx += 1 - arg, data_type = split_arg_dt(arg_dt) - - properties = fields[idx] - idx += 1 - format_kind = fields[idx] - idx += 1 - tag = fields[idx] - idx += 1 - - # Add compatibility for v3.1 verbose and below, - # when strides delimeter is absent. - # TODO: remove eventually. 
- strides = "" - if "f" not in fields[idx] and format_kind != "undef": - strides = fields[idx] - idx += 1 - - flags = {} - flags["value"] = fields[idx] - idx += 1 - if len(fields) > idx: - flag_fields = fields[idx:] - for f in flag_fields: - if f[:3] == "s8m": - flags["s8_comp_mask"] = f[3:] - if f[:3] == "zpm": - flags["zp_comp_mask"] = f[3:] - - mds.append( - { - "arg": arg, - "data_type": data_type, - "properties": properties, - "format_kind": format_kind, - "tag": tag, - "strides": strides, - "flags": flags, - } - ) - return mds - - def convert_aux(log_aux, version): - aux = {} - if log_aux == "": - return aux - for log_aux_l in log_aux.split(" "): - # Handle strings like NAME:VAL1[:VAL2[:VAL3...]] - res = log_aux_l.split(":") - field = res[0] - value = "" - last_idx = len(res) - 1 - for i in range(1, last_idx): - val_i = res[i] - value += f"{val_i}:" - val_n = res[last_idx] - value += f"{val_n}" - aux[field] = value - return aux - - def convert_prim_kind(prim_kind, version): - return prim_kind - - def convert_exts(exts, version): - def extract_attr(attrs, type): - start_idx = attrs.find(type) - if start_idx == -1: - return "" - - start_idx += len(type) + 1 - end_symbol = " " - end_idx = attrs.find(end_symbol, start_idx) - if end_idx == -1: - end_idx = None - return attrs[start_idx:end_idx] - - def convert_structure_to_ir_seq(ir, value): - params = value.split(":") - fields = list(ir.keys()) - ir.update( - (fields[i], params[i]) - for i in range(0, min(len(params), len(fields))) - ) - return ir - - def convert_post_ops(value): - def convert_binary_post_op(value): - p_op = {"alg": "", "dt": "f32", "mask": "0", "tag": None} - p_op = convert_structure_to_ir_seq(p_op, value) - p_op["prim_kind"] = "binary" - return p_op - - def convert_dw_post_op(value): - p_op = { - "alg": "", - "ksp": "", - "dst_dt": "f32", - "wei_dt": "f32", - "scales": {"mask": "0", "value": None}, - } - params = value.split(":") - len_params = len(params) - p_op["alg"] = params[0] - p_op["ksp"] = params[1] - if len_params > 2: - p_op["dst_dt"] = params[2] - if len_params > 3: - p_op["wei_dt"] = "s8" - p_op["scales"]["mask"] = params[3] - if len_params > 4: - p_op["scales"]["value"] = params[4] - return p_op - - def convert_eltwise_post_op(value): - p_op = { - "alg": "", - "alpha": "1.0", - "beta": "0.0", - "scale": "1.0", - } - return convert_structure_to_ir_seq(p_op, value) - - def convert_sum_post_op(value): - p_op = {"alg": "", "scale": "1.0", "zp": "0", "dt": ""} - return convert_structure_to_ir_seq(p_op, value) - - def convert_prelu_post_op(value): - p_op = {"alg": "", "mask": "0"} - return convert_structure_to_ir_seq(p_op, value) - - convert = { - "binary": convert_binary_post_op, - "dw": convert_dw_post_op, - "eltwise": convert_eltwise_post_op, - "sum": convert_sum_post_op, - "prelu": convert_prelu_post_op, - } - - entries = value.split("+") - postops = [] - for e in entries: - for k in convert.keys(): - if k in e: - cvt = convert.get(k) - postops.append(cvt(e)) - break - return postops - - def convert_scales(value): - res = {} - scales = value.split("+") - for s in scales: - arg = s[: s.find(":")] - s_wo_arg = s[s.find(":") + 1 :] - scale_dict = {"mask": "0", "data_type": "f32", "groups": ""} - res[arg] = convert_structure_to_ir_seq(scale_dict, s_wo_arg) - return res - - def convert_zero_points(value): - res = {} - zp_value = value.split("+") - for zp in zp_value: - arg = zp[: zp.find(":")] - zp_value_wo_arg = zp[zp.find(":") + 1 :] - zp_dict = {"mask": "0", "data_type": "s32", "groups": ""} - res[arg] = 
convert_structure_to_ir_seq(zp_dict, zp_value_wo_arg) - return res - - def convert_rounding_mode(value): - res = {} - rounding_modes = value.split("+") - for r in rounding_modes: - arg = r[: r.find(":")] - res[arg] = r[r.find(":") + 1 :] - return res - - def convert_scratchpad_mode(value): - return value - - def convert_fpmath_mode(value): - return value - - def convert_acc_mode(value): - return value - - def convert_dropout(value): - res = {"p": 0} - elems = value.split(":") - res["p"] = elems[0] - if len(elems) > 1: - res["seed"] = elems[1] - if len(elems) > 2: - res["tag"] = elems[2] - return res - - def convert_deterministic(value): - return value - - converters = { - "attr-post-ops": convert_post_ops, - "attr-scales": convert_scales, - "attr-zero-points": convert_zero_points, - "attr-scratchpad": convert_scratchpad_mode, - "attr-fpmath": convert_fpmath_mode, - "attr-acc": convert_acc_mode, - "attr-rounding-mode": convert_rounding_mode, - "attr-dropout": convert_dropout, - "attr-deterministic": convert_deterministic, - } - attrs = {} - for e in converters.keys(): - attr = extract_attr(exts, e) - if attr != "": - attrs[e] = converters[e](attr) - return attrs - - def convert_pass(v, version): - return v - - convert = { - "prim_kind": convert_prim_kind, - "mds": convert_mds, - "aux": convert_aux, - "exts": convert_exts, - } - - dnnl_to_ir = { - "engine": "engine", - "prim_kind": "primitive", - "impl": "implementation", - "prop_kind": "prop_kind", - "mds": "memory_descriptors", - "exts": "attributes", - "aux": "auxiliary", - "shapes": "problem_desc", - "time": "exec_time", - "timestamp": "timestamp", - } - - ir_req = [ - "engine", - "prim_kind", - "impl", - "prop_kind", - "mds", - "exts", - "aux", - "shapes", - ] - - entry = {} - - t = template.split(",") - for key, value in dnnl_to_ir.items(): - notification_level = "WARN" if key in ir_req else "INFO" - try: - idx = t.index(value) - if idx != -1: - cvt = convert.get(key) - if cvt is None: - cvt = convert_pass - field = log_entry[idx] - try: - entry[key] = cvt(field, version) - except: - self.__writer.print( - f"Parser: parsing entry error: {field}: {value}", - notification_level, - ) - else: - self.__writer.print( - f"Parser: Unknown entry: {value}", notification_level - ) - except: - self.__writer.print( - f"Parser: skipping empty entry: {key}", notification_level - ) - return entry - - # `verbose_template` should have `component` field as second entry, but - # since it gets discarded for compatibility with previous verbose - # outputs, it's not in the final version of the string. - # Restore `component` when the least compatible library version's - # verbose output will contain it. - verbose_template = ( - "onednn_verbose,operation,engine,primitive," - + "implementation,prop_kind,memory_descriptors,attributes," - + "auxiliary,problem_desc" - ) - - i = len(self.__data) - for line in self.__input: - self.__raw_data.append(line.rstrip()) - l_raw = line.split(",") - marker = l_raw[0] - if marker != "onednn_verbose": - continue - - verbose_version = 0 - # Check for version presence, discard 'v' from numerical version, - # and discard version entry for compatibility reasons. - # Note: to compare against `version`, one must use int() function - # call as arg is passed as `str` object! - if l_raw[1][0] == "v" and l_raw[1][1].isdigit(): - verbose_version = l_raw[1].lstrip("v") - l_raw.pop(1) - - # Discard a timestamp when it's supplied in a standalone line. - # TODO: update verbose_template instead. 
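For context, the verbose lines this loop consumes (and that the new parse module below takes over) look roughly like the following; the field values are illustrative, not from a real run:

    line = "onednn_verbose,v1,primitive,exec,cpu,convolution,jit:avx2,forward_training,...,0.0509"
    marker, version, component, event = line.split(",")[:4]
    # marker == "onednn_verbose", version == "v1" (optional),
    # component == "primitive" (optional), event == "exec"

The optional "vN" version, standalone timestamp, and component fields are peeled off before the remaining fields are matched against the template.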
- if l_raw[1].split(".")[0].isdigit(): - l_raw.pop(1) - # Skip Graph component as not supported - if l_raw[1] == "graph": - continue - # Remove a component from the line if presented (see a comment above) - if l_raw[1] == "primitive" or l_raw[1] == "ukernel": - l_raw.pop(1) - - event = l_raw[1].split(":")[0] - if event == "info": - opt = l_raw[2] - if opt.split(":")[0] == "template": - verbose_template = "onednn_verbose," + line.split(":")[1] - if event in filter_events: - l_converted = convert_primitive( - l_raw, verbose_template + ",exec_time", verbose_version - ) - if l_converted: - self.__data[i] = l_converted - i = i + 1 + parser = parse.Parser(self.input, filter_events, self.error_handler) + self.data = list(parser) def get_data(self): """ @@ -505,7 +71,7 @@ def get_data(self): data """ - return self.__data + return {i: entry for i, (_, entry) in enumerate(self.data)} def dump(self, converted=False): """ @@ -513,18 +79,16 @@ def dump(self, converted=False): Parameters ---------- - converted (default: False) -- If True dump() prints data in internal - represenataion, otherwise prints data in the original form. + converted (default: False) -- If truthy, prints data in internal + representation, otherwise prints data in the original form. Returns ------- None """ - if converted: - [ - self.__writer.print(f"{key}, {value}", "STDIO") - for key, value in self.__data.items() - ] - else: - [self.__writer.print(d, "STDIO") for d in self.__raw_data] + for i, (line, entry) in enumerate(self.data): + if converted: + print(f"{i}, {entry!r}") + else: + print(line) diff --git a/scripts/verbose_converter/src/ir.py b/scripts/verbose_converter/src/ir.py new file mode 100644 index 00000000000..8c399ce084a --- /dev/null +++ b/scripts/verbose_converter/src/ir.py @@ -0,0 +1,436 @@ +################################################################################ +# Copyright 2024-2025 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import enum +import string +from abc import abstractmethod +from collections.abc import MutableMapping +from dataclasses import MISSING, dataclass, fields +from typing import Dict, List, Optional, Union + + +def alias(attr): + def getter(self): + return getattr(self, attr) + + def setter(self, value): + return setattr(self, attr, value) + + def deleter(self): + return delattr(self, attr) + + return property(getter, setter, deleter, attr) + + +def hash_str(obj): + return getattr(obj.__class__, "__hash_str__", str)(obj) + + +@dataclass(eq=False) +class Mapping(MutableMapping): + def __getitem__(self, item): + try: + value = getattr(self, item) + if isinstance(value, int): + value = str(value) + elif isinstance(value, float): + value = str(value) + # The verbose converter assumes defaults are 1.0, whereas + # oneDNN assumes defaults are 0.0. 
This is a workaround so that
+                # these values are not accidentally dropped: they are kept
+                # as 0 or 1 and always passed through to the benchdnn
+                # reproducer.
+                if value[-2:] == ".0":
+                    value = value[:-2]
+            return value
+        except AttributeError:
+            raise KeyError(item)
+
+    def __setitem__(self, item, value):
+        setattr(self, item, value)
+
+    def __delitem__(self, item):
+        delattr(self, item)
+
+    def __len__(self):
+        return len(fields(self))
+
+    def __iter__(self):
+        for field in fields(self):
+            yield field.name
+
+    def __hash__(self):
+        return hash(hash_str(self))
+
+    def __eq__(self, other):
+        if not isinstance(other, self.__class__):
+            return False
+        return hash_str(self) == hash_str(other)
+
+    def __str__(self):
+        raise NotImplementedError
+
+    def __hash_str__(self):
+        return str(self)
+
+    def __repr__(self):
+        child_reprs = []
+        for key, value in self.items():
+            child_reprs.append(f"{key!r}: {value!r}")
+        return "{" + ", ".join(child_reprs) + "}"
+
+
+@dataclass(eq=False)
+class MemoryDescriptor(Mapping):
+    @dataclass(eq=False)
+    class Flags(Mapping):
+        value: str
+        s8_comp_mask: Optional[str] = None
+        zp_comp_mask: Optional[str] = None
+        scale_adjust: float = 1.0
+
+        def __str__(self):
+            my_str = self.value
+            if self.s8_comp_mask is not None:
+                my_str += f":s8m{self.s8_comp_mask}"
+            if self.zp_comp_mask is not None:
+                my_str += f":zpm{self.zp_comp_mask}"
+            if self.scale_adjust != 1.0:
+                my_str += f":sa{self.scale_adjust}"
+            return my_str
+
+    arg: str
+    data_type: str
+    properties: str
+    format_kind: str
+    tag: str
+    flags: Flags
+    strides: str = ""  # Pre-v3.1 does not have strides
+
+    padding = alias("properties")
+
+    def __len__(self):
+        return 1 + super().__len__()
+
+    def __iter__(self):
+        yield from super().__iter__()
+        yield "padding"
+
+    def _format(self, tag: str, convert) -> str:
+        header = f"{self.arg}:{self.data_type}"
+        return ":".join(
+            [
+                header,
+                self.properties,
+                self.format_kind,
+                tag,
+                self.strides,
+                convert(self.flags),
+            ]
+        )
+
+    def __str__(self):
+        return self._format(self.tag, str)
+
+    def __hash_str__(self):
+        tag = self.tag
+        if "a" not in self.properties:
+            return self._format(tag, hash_str)
+        for i, c in enumerate(tag):
+            if not c.isalpha():
+                return self._format(string.ascii_lowercase[:i], hash_str)
+        return self._format(string.ascii_lowercase[: len(tag)], hash_str)
+
+
+@dataclass(eq=False)
+class Dropout(Mapping):
+    tag: Optional[str] = None
+
+    def __str__(self):
+        return self.tag or ""
+
+
+class FormattedMapping(Mapping):
+    @abstractmethod
+    def _format(self, _) -> str:
+        raise NotImplementedError
+
+    def __str__(self):
+        return self._format(str)
+
+    def __hash_str__(self):
+        return self._format(hash_str)
+
+
+@dataclass(eq=False)
+class PostOp(FormattedMapping):
+    alg: str
+
+    def _format(self, convert):
+        required_args = []
+        optional_args = []
+        seen_non_default = False
+        for field in reversed(fields(self)):
+            if field.name == "alg":
+                continue
+            value = getattr(self, field.name)
+            if field.default is MISSING:
+                required_args.append(value)
+                continue
+            if not seen_non_default and value == field.default:
+                continue
+            seen_non_default = True
+            optional_args.append(value)
+        args = [self.alg] + required_args[::-1] + optional_args[::-1]
+        return ":".join(map(convert, args))
+
+
+@dataclass(eq=False)
+class SumPostOp(PostOp):
+    alg: str = "sum"
+    scale: float = 1.0
+    zp: int = 0
+    dt: str = ""
+
+
+@dataclass(eq=False)
+class DepthwiseScales(Mapping):
+    mask: int = 0
+    value: Optional[str] = None
+
+    def __str__(self):
+        if self.value is not
None: + return f"{self.mask}:{self.value}" + if self.mask != 0: + return str(self.mask) + return "" + + +@dataclass(eq=False) +class KSPMixin: + ksp: str + + +@dataclass(eq=False) +class DepthwisePostOp(PostOp, KSPMixin): + alg: str = "dw" + dst_dt: str = "f32" + wei_dt: str = "f32" + scales: DepthwiseScales = DepthwiseScales() + + def __len__(self): + return 1 + super().__len__() + + def __iter__(self): + yield "alg" + yield from super().__iter__() + + +@dataclass(eq=False) +class PreLUPostOp(PostOp): + alg: str = "prelu" + mask: int = 0 + has_scaleshift: bool = False + + def __getitem__(self, item): + if item == "has_scaleshift": + return "true" if self.has_scaleshift else "" + return super().__getitem__(item) + + def __str__(self): + if self.has_scaleshift: + return f"{self.alg}:{self.mask}:true" + return f"{self.alg}:{self.mask}" + + +@dataclass(eq=False) +class EltwisePostOp(PostOp): + alpha: float = 0.0 + beta: float = 0.0 + scale: float = 1.0 + + +@dataclass(eq=False) +class BinaryPostOp(PostOp): + dt: str + mask: int = 0 + tag: str = "any" + + +@dataclass(eq=False) +class QuantizationParam(Mapping): + value: float + data_type: str + mask: int = 0 + groups: str = "" + + def __str__(self): + if self.groups: + return f"{self.mask}:{self.data_type}:{self.groups}" + return f"{self.mask}:{self.data_type}" + + +@dataclass(eq=False) +class Scale(QuantizationParam): + value: float = 1.0 + data_type: str = "f32" + + +@dataclass(eq=False) +class ZeroPoint(QuantizationParam): + value: int = 0 + data_type: str = "s32" + + +class CompositeAttribute: + def __str__(self): + raise NotImplementedError + + +@dataclass(eq=False) +class FPMathMode(CompositeAttribute): + mode: str + apply_to_int: bool = False + + def __str__(self): + a2i_str = ":true" if self.apply_to_int else "" + return self.mode + a2i_str + + +class RoundingMode(CompositeAttribute, enum.Enum): + ENVIRONMENT = "environment" + STOCHASTIC = "stochastic" + + def __str__(self): + return self.value + + +Attribute = Union[ + str, # acc-mode, etc + FPMathMode, + Dropout, + List[PostOp], + Dict[str, Scale], + Dict[str, ZeroPoint], + Dict[str, RoundingMode], + Scale, # oscale +] + + +@dataclass(eq=False) +class Attributes(FormattedMapping): + acc_mode: Optional[str] = None + deterministic: Optional[str] = None + dropout: Optional[Dropout] = None + fpmath: Optional[FPMathMode] = None + oscale: Optional[Scale] = None + post_ops: Optional[List[PostOp]] = None + rounding_mode: Optional[Dict[str, RoundingMode]] = None + scales: Optional[Dict[str, Scale]] = None + scratchpad: Optional[str] = None + zero_points: Optional[Dict[str, ZeroPoint]] = None + + acc = alias("acc_mode") + + @staticmethod + def _field_name_to_attr_name(field_name: str): + return "attr-" + field_name.replace("_", "-") + + def _attr_name_to_field_name(self, item: str): + original_item = item + for field in fields(self): + if item == self._field_name_to_attr_name(field.name): + return field.name + raise KeyError(original_item) + + def __getitem__(self, item: str): + value = getattr(self, self._attr_name_to_field_name(item)) + if value is None: + raise KeyError(item) + return value + + def __setitem__(self, item: str, value: Attribute): + return setattr(self, self._attr_name_to_field_name(item), value) + + def __delitem__(self, item: str): + setattr(self, self._attr_name_to_field_name(item), None) + + def __iter__(self): + for field in fields(self): + if getattr(self, field.name) is not None: + yield self._field_name_to_attr_name(field.name) + + def __len__(self): + return 
len(list(iter(self))) + + def _format(self, convert): + parts = [] + for key, attr in self.items(): + if isinstance(attr, list): + sub_parts = "+".join(map(convert, attr)) + parts.append(f"{key}:{sub_parts}") + elif isinstance(attr, dict): + converted = (f"{k}:{convert(v)}" for k, v in attr.items()) + combined = "+".join(converted) + parts.append(f"{key}:{combined}") + else: + parts.append(f"{key}:{convert(attr)}") + return " ".join(parts) + + +@dataclass(eq=False) +class HashableEntry(FormattedMapping): + operation: str + engine: str + prim_kind: str + impl: str + prop_kind: str + aux: Dict[str, str] + mds: List[MemoryDescriptor] + shapes: str + exts: Attributes + + def _format(self, convert): + parts = [ + self.operation, + self.engine, + self.prim_kind, + self.impl, + self.prop_kind, + " ".join(map(convert, self.mds)), + convert(self.exts), + " ".join(f"{k}:{convert(v)}" for k, v in self.aux.items()), + self.shapes, + ] + return ",".join(parts) + + def __str__(self): + return f"onednn_verbose,v1,primitive,{super().__str__()},0" + + +class Entry(HashableEntry): + def __init__( + self, + *, + time=0.0, + timestamp: Optional[float] = None, + version: int = 0, + **kwargs, + ): + self.time = time + self.timestamp = timestamp + self.version = version + super().__init__(**kwargs) diff --git a/scripts/verbose_converter/src/parse.py b/scripts/verbose_converter/src/parse.py new file mode 100644 index 00000000000..0b20d98bb8e --- /dev/null +++ b/scripts/verbose_converter/src/parse.py @@ -0,0 +1,659 @@ +################################################################################ +# Copyright 2024-2025 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import string +from contextlib import nullcontext +from typing import ( + ContextManager, + Dict, + Iterable, + Iterator, + List, + Optional, + Set, + Tuple, +) + +from . 
import ir + +__all__ = ["Parser"] + + +class ParseSpec: + digits = list(string.digits) + + def __init__(self, buf: str): + self._buf = buf + self.offset = 0 + + def __str__(self): + return self.buf + + @property + def buf(self): + return self._buf[self.offset :] + + @property + def eof(self): + return self.offset >= len(self._buf) + + def peek(self, n=1): + return self.buf[:n] + + def seek(self, n=1): + self._read(n) + + def _read(self, n: int) -> str: + token = self._buf[self.offset : self.offset + n] + self.offset += n + return token + + def _find_str(self) -> int: + buf = ParseSpec(self.buf) + while not buf.eof and buf.peek() not in ("+", ":"): + buf.seek() + return buf.offset + + def _find_uint(self) -> int: + buf = ParseSpec(self.buf) + if buf.eof or buf.peek() not in self.digits: + return 0 + + if not buf.read_literal("0"): + while buf.read_one_of(*self.digits): + pass + return buf.offset + + def _find_int(self) -> int: + buf = ParseSpec(self.buf) + buf.read_one_of("-", "+") + return buf.offset + buf._find_uint() + + def _find_float(self) -> int: + buf = ParseSpec(self.buf) + buf.read_one_of("-", "+") + if buf.eof or buf.peek() not in ["."] + self.digits: + return 0 # ignore [+/-][e...] + if not buf.read_literal("0"): + while buf.read_one_of(*self.digits): + pass + # else: we already read a 0. + if buf.read_literal("."): + while buf.read_one_of(*self.digits): + pass + if buf.read_literal("e"): + buf.read_one_of("-", "+") + if not buf.read_one_of(*self.digits): + return 0 # ignore [+/-][X][.Y]e[+/-] + while buf.read_one_of(*self.digits): + pass + return buf.offset + + def _find_literal(self, literal): + if self.buf.startswith(literal): + return len(literal) + return 0 + + def read_str(self) -> str: + return self._read(self._find_str()) + + def read_literal(self, literal: str) -> Optional[str]: + offset = self._find_literal(literal) + if offset == len(literal): + return self._read(offset) + return None + + def read_one_of(self, *literals: str) -> Optional[str]: + for literal in literals: + if self.read_literal(literal) is not None: + return literal + return None + + def read_uint(self) -> Optional[int]: + offset = self._find_uint() + if offset: + return int(self._read(offset)) + return None + + def read_int(self) -> Optional[int]: + offset = self._find_int() + if offset: + return int(self._read(offset)) + return None + + def read_float(self) -> Optional[float]: + offset = self._find_float() + if offset: + return float(self._read(offset)) + return None + + +class ParseError(ValueError): + pass + + +class InvalidEntryError(ParseError): + pass + + +class ParserImpl: + default_template = ( + "operation,engine,primitive,implementation,prop_kind," + + "memory_descriptors,attributes,auxiliary,problem_desc,exec_time" + ) + _version_map: Dict[int, type] = {} + + @staticmethod + def parse_aux(aux: str): + parsed: Dict[str, str] = {} + if aux == "": + return parsed + for aux_l in aux.split(): + # Handle strings like NAME:VAL1[:VAL2[:VAL3...]] + field, *values = aux_l.split(":", 1) + parsed[field] = values[0] if values else "" + return parsed + + def parse_mds(self, descriptors): + try: + return list(map(self.parse_md, descriptors.split())) + except ValueError: + raise ValueError(f"Could not parse mds {descriptors}") + + @staticmethod + def is_bit_layout(dt): + buf = ParseSpec(dt) + if not buf.read_literal("e"): + return False + if buf.read_uint() is None: + return False + if not buf.read_literal("m"): + return False + if buf.read_uint() is None: + return False + return buf.eof # eXmY + + def 
is_float_type(self, dt): + buf = ParseSpec(dt) + buf.read_literal("b") # ignore b in bf16 + if not buf.read_literal("f"): + return False + if buf.read_uint() is None: + return False + if buf.eof: + return True # bf16, f16, f32, f64 + if not buf.read_literal("_"): + return False + return self.is_bit_layout(buf.buf) # fZ_eXmY + + @staticmethod + def is_int_type(dt): + buf = ParseSpec(dt) + if not buf.read_one_of("u", "s"): + return False + if buf.read_uint() is None: + return False + return buf.eof + + def is_data_type(self, dt): + return ( + dt == "undef" + or self.is_int_type(dt) + or self.is_float_type(dt) + or self.is_bit_layout(dt) + ) + + @staticmethod + def parse_md_flags(flags, fields): + flags = ir.MemoryDescriptor.Flags(value=flags or "f0") + for field in fields: + if field[:3] == "s8m": + flags.s8_comp_mask = field[3:] + elif field[:3] == "zpm": + flags.zp_comp_mask = field[3:] + elif field[:2] == "sa": + flags.scale_adjust = float(field[2:]) + return flags + + def parse_md(self, descriptor): + fields = descriptor.split(":") + arg_dt, properties, format_kind, tag = fields[:4] + arg_dt_parts = arg_dt.split("_") + for i in range(1, len(arg_dt_parts)): + arg = "_".join(arg_dt_parts[:i]) + dt = "_".join(arg_dt_parts[i:]) + if self.is_data_type(dt): + break + else: + if len(arg_dt_parts) != 1 or not self.is_data_type(arg_dt): + raise ParseError( + f"Could not parse memory descriptor {descriptor}" + ) + arg, dt = "data", arg_dt + + strides = "" + if "f" not in fields[4] and format_kind != "undef": + strides = fields[4] + flags = self.parse_md_flags(fields[5], fields[6:]) + else: + flags = self.parse_md_flags(fields[4], fields[5:]) + return ir.MemoryDescriptor( + arg=arg, + data_type=dt, + properties=properties, + format_kind=format_kind, + tag=tag, + strides=strides, + flags=flags, + ) + + def parse_attrs(self, attrs): + exts = ir.Attributes() + for attr in attrs.split(): + spec = ParseSpec(attr) + name, args = spec.read_str(), "" + if spec.read_literal(":"): + args = spec.buf + if name in ("attr-acc-mode", "attr-acc"): + exts.acc_mode = self.parse_acc_mode(args) + elif name == "attr-deterministic": + exts.deterministic = self.parse_deterministic(args) + elif name == "attr-dropout": + exts.dropout = self.parse_dropout(args) + elif name == "attr-fpmath": + exts.fpmath = self.parse_fpmath_mode(args) + # Kept for compatibility with v2.7 and below. 
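In effect, parse_attrs receives a space-separated list of attribute entries and splits each on its first ":" into a name and an argument string before dispatching on the name; for example (values made up):

    attrs = "attr-scratchpad:user attr-post-ops:eltwise_relu+sum:2"
    for attr in attrs.split():
        name, _, args = attr.partition(":")
        # ("attr-scratchpad", "user"), then ("attr-post-ops", "eltwise_relu+sum:2")

The "attr-oscale" branch that follows covers the single output scale emitted by v2.7 and older logs in place of the per-argument "attr-scales".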
+            elif name == "attr-oscale":
+                exts.oscale = self.parse_oscale(args)
+            elif name == "attr-post-ops":
+                exts.post_ops = self.parse_post_ops(args)
+            elif name == "attr-rounding-mode":
+                exts.rounding_mode = self.parse_rounding_modes(args)
+            elif name == "attr-scales":
+                exts.scales = self.parse_scales(args)
+            elif name == "attr-scratchpad":
+                exts.scratchpad = self.parse_scratchpad_mode(args)
+            elif name == "attr-zero-points":
+                exts.zero_points = self.parse_zero_points(args)
+        return exts
+
+    def parse_post_ops(self, post_ops: str):
+        spec = ParseSpec(post_ops)
+        parsed: List[ir.PostOp] = []
+        while True:
+            alg = spec.read_str()
+            if alg == "sum":
+                parsed.append(self.parse_sum_post_op(spec))
+            elif alg == "dw":
+                parsed.append(self.parse_dw_post_op(spec))
+            elif alg == "prelu":
+                parsed.append(self.parse_prelu_post_op(spec))
+            elif alg.startswith("eltwise_"):
+                parsed.append(self.parse_eltwise_post_op(spec, alg))
+            elif alg.startswith("binary_"):
+                parsed.append(self.parse_binary_post_op(spec, alg))
+            else:
+                raise ParseError(f"Unexpected post-op: {alg}")
+            if not spec.read_literal("+"):
+                break
+        return parsed
+
+    @staticmethod
+    def parse_sum_post_op(spec) -> ir.SumPostOp:
+        post_op = ir.SumPostOp()
+        if spec.read_literal(":"):
+            post_op.scale = spec.read_float()
+            if spec.read_literal(":"):
+                post_op.zp = spec.read_int()
+                if spec.read_literal(":"):
+                    post_op.dt = spec.read_str()
+        return post_op
+
+    @staticmethod
+    def parse_dw_post_op(spec) -> ir.DepthwisePostOp:
+        if not spec.read_literal(":"):
+            raise ParseError("Expected argument for depthwise post-op")
+        ksp = spec.read_str()
+        post_op = ir.DepthwisePostOp(ksp=ksp)
+        # Give each entry its own scales object; mutating the shared
+        # class-level default instance would leak state across entries.
+        post_op.scales = ir.DepthwiseScales()
+        if spec.read_literal(":"):
+            post_op.dst_dt = spec.read_str()
+            if spec.read_literal(":"):
+                post_op.wei_dt = "s8"
+                post_op.scales.mask = spec.read_uint()
+                if spec.read_literal(":"):
+                    post_op.scales.value = spec.read_str()
+        return post_op
+
+    @staticmethod
+    def parse_prelu_post_op(spec) -> ir.PreLUPostOp:
+        post_op = ir.PreLUPostOp()
+        if spec.read_literal(":"):
+            post_op.mask = spec.read_uint()
+            if spec.read_literal(":"):
+                post_op.has_scaleshift = spec.read_str() == "true"
+        return post_op
+
+    @staticmethod
+    def parse_eltwise_post_op(spec, alg) -> ir.EltwisePostOp:
+        post_op = ir.EltwisePostOp(alg=alg)
+        if spec.read_literal(":"):
+            post_op.alpha = spec.read_float()
+            if spec.read_literal(":"):
+                post_op.beta = spec.read_float()
+                if spec.read_literal(":"):
+                    post_op.scale = spec.read_float()
+        return post_op
+
+    @staticmethod
+    def parse_binary_post_op(spec, alg) -> ir.BinaryPostOp:
+        if not spec.read_literal(":"):
+            raise ParseError("Expected data type for binary post-op")
+        dt = spec.read_str()
+        post_op = ir.BinaryPostOp(alg=alg, dt=dt)
+        if spec.read_literal(":"):
+            post_op.mask = spec.read_uint()
+            if spec.read_literal(":"):
+                post_op.tag = spec.read_str()
+        return post_op
+
+    @staticmethod
+    def parse_dropout(args: str) -> ir.Dropout:
+        return ir.Dropout(tag=args if args else None)
+
+    @staticmethod
+    def parse_per_argument(attr, name, parse):
+        spec = ParseSpec(attr)
+        parsed = {}
+        while True:
+            arg = spec.read_str()
+            if not spec.read_literal(":"):
+                raise ParseError(f"Expected mask for {arg} {name}")
+            parsed[arg] = parse(spec)
+            if not spec.read_literal("+"):
+                break
+        return parsed
+
+    def parse_scales(self, scales: str):
+        return self.parse_per_argument(scales, "scale", self.parse_scale)
+
+    @staticmethod
+    def parse_quantization_param(spec, read_value, param_type):
+        # Old style: mask[:[value[*]|*]]
+        # New style: mask[:data_type[:groups]]
+
param = param_type() + param.mask = spec.read_uint() + if spec.read_literal(":"): + value = read_value() + if value is not None: + param.value = value + spec.read_literal("*") + elif spec.read_literal("*"): + pass + elif not spec.eof: # new style + param.data_type = spec.read_str() + if spec.read_literal(":"): + param.groups = spec.read_str() + return param + + # v2.7 and below + def parse_oscale(self, oscale: str): + spec = ParseSpec(oscale) + return self.parse_scale(spec) + + def parse_scale(self, spec) -> ir.Scale: + return self.parse_quantization_param(spec, spec.read_float, ir.Scale) + + def parse_zero_points(self, zps: str): + return self.parse_per_argument(zps, "zero point", self.parse_zero_point) + + def parse_zero_point(self, spec) -> ir.ZeroPoint: + return self.parse_quantization_param(spec, spec.read_int, ir.ZeroPoint) + + @staticmethod + def parse_fpmath_mode(mathmode: str) -> ir.FPMathMode: + spec = ParseSpec(mathmode) + mode = spec.read_str() + apply_to_int = False + if spec.read_literal(":"): + apply_to_int = spec.read_str() == "true" + return ir.FPMathMode(mode=mode, apply_to_int=apply_to_int) + + @staticmethod + def parse_rounding_mode(rounding_mode: str) -> ir.RoundingMode: + rm = rounding_mode.lower() + for member in ir.RoundingMode.__members__.values(): + if str(member) == rm: + return member + else: + raise ValueError(f"Invalid rounding mode {rounding_mode}") + + def parse_rounding_modes(self, rounding_modes: str): + spec = ParseSpec(rounding_modes) + modes: Dict[str, ir.RoundingMode] = {} + while True: + arg = spec.read_str() + if not spec.read_literal(":"): + raise ValueError("Expected rounding mode") + mode = self.parse_rounding_mode(spec.read_str()) + modes[arg] = mode + if not spec.read_literal("+"): + break + return modes + + identity = staticmethod(lambda x: x) + + # Additional attributes + parse_acc_mode = identity + parse_deterministic = identity + parse_scratchpad_mode = identity + + # Additional template components + parse_operation = identity + parse_prim_kind = identity + parse_prop_kind = identity + parse_engine = identity + parse_impl = identity + parse_shapes = identity + parse_time = staticmethod(float) + parse_timestamp = staticmethod(float) + + def dnnl_to_ir(self): + return { + "operation": ("operation", self.parse_operation, True), + "engine": ("engine", self.parse_engine, True), + "primitive": ("prim_kind", self.parse_prim_kind, True), + "implementation": ("impl", self.parse_impl, True), + "prop_kind": ("prop_kind", self.parse_prop_kind, True), + "memory_descriptors": ("mds", self.parse_mds, True), + "attributes": ("exts", self.parse_attrs, True), + "auxiliary": ("aux", self.parse_aux, True), + "problem_desc": ("shapes", self.parse_shapes, True), + "exec_time": ("time", self.parse_time, False), + "timestamp": ("timestamp", self.parse_timestamp, False), + } + + def parse(self, line: str, template: Optional[str]): + if template is None: + template = self.default_template + entry = {} + fields = template.rstrip().split(",") + values = line.rstrip().split(",") + mapping = self.dnnl_to_ir() + min_fields = sum((mapping[field][2] for field in fields)) + max_fields = len(fields) + if len(values) < min_fields: + raise InvalidEntryError("parse error: too few fields to parse") + if len(values) > max_fields: + raise InvalidEntryError("parse error: too many fields to parse") + mapped = dict(zip(fields, values)) + for field, (key, parse, reqd) in mapping.items(): + if field not in mapped: + if not reqd: + continue + raise InvalidEntryError(f"parse error: 
missing {field} field") + value = mapped[field] + try: + entry[key] = parse(value) + except (ParseError, ValueError) as e: + raise ParseError(f"parse error: {field}: {value} ({e!s})") + return entry + + +def register(*, version: int): + def registrar(impl: type): + ParserImpl._version_map[version] = impl + return impl + + return registrar + + +@register(version=0) +class LegacyParserImpl(ParserImpl): + pass + + +@register(version=1) +class V1ParserImpl(ParserImpl): + def parse_md(self, descriptor): + fields = descriptor.split(":") + return ir.MemoryDescriptor( + arg=fields[0], + data_type=fields[1], + properties=fields[2], + format_kind=fields[3], + tag=fields[4], + strides=fields[5], + flags=self.parse_md_flags(fields[6], fields[7:]), + ) + + +class Parser: + _parser_impls: Dict[int, ParserImpl] = {} + _default_events = "exec", "create", "create_nested" + + def __init__( + self, + input: Iterable[str], + events: Iterable[str] = _default_events, + error_handler: ContextManager = nullcontext(), + ): + self.input = input + self.events = set(events) + self.error_handler = error_handler + + def _fix_template(self, template) -> Optional[str]: + return template + + @staticmethod + def _parse_leading_fields(input: Iterable[str]): + MARKER = "onednn_verbose" + for line in map(str.rstrip, input): + if not line.startswith(f"{MARKER},"): + continue + try: + _, operation, args = line.split(",", 2) + except ValueError: + continue + version = 0 + if operation.startswith("v"): + try: + version = int(operation[1:]) + except ValueError: + pass + else: + operation, args = args.split(",", 1) + timestamp = None + try: + timestamp = float(operation) + except ValueError: + pass + else: + operation, args = args.split(",", 1) + component = "primitive" + if operation in ("graph", "primitive", "ukernel"): + component = operation + operation, args = args.split(",", 1) + yield line, version, timestamp, component, operation, args + + def __iter__(self) -> Iterator[Tuple[str, ir.Entry]]: + template = None + cache: Dict[str, dict] = {} + errors: Set[str] = set() + parsed = self._parse_leading_fields(self.input) + for line, version, timestamp, component, operation, args in parsed: + if component == "graph": + continue + event = operation.split(":", 1)[0] + if event == "info": + for marker in ("template", "prim_template"): + if not args.startswith(f"{marker}:"): + continue + fixed_template = self._fix_template(args[len(marker) + 1 :]) + if fixed_template is not None: + break + else: + continue + first_component, rest = fixed_template.split(",", 1) + # Timestamp is usually out of order with respect to the + # template because of missing component for "graph", + # "primitive", "ukernel", etc. 
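The template line being repaired here is announced by the library itself, roughly as "onednn_verbose,info,template:..." (field order illustrative):

    template = "timestamp,operation,engine,primitive,..."
    first_component, rest = template.split(",", 1)
    # first_component == "timestamp"; keeping only `rest` aligns the
    # template with payloads whose timestamp and component fields were
    # already consumed by _parse_leading_fields.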
+ if first_component == "timestamp": + fixed_template = rest + if template != fixed_template: + template = fixed_template + cache.clear() + continue + if event not in self.events: + continue + leading_args, last_arg = args.rsplit(",", 1) + try: + time = float(last_arg) + except ValueError: + time = 0.0 + leading_args = args + key = f"v{version},{component},{operation},{leading_args}" + if key in errors: + continue + success = False + with self.error_handler: + if key in cache: + params = dict(cache[key]) + params.update(time=time, timestamp=timestamp) + else: + new_line = f"{operation},{args}" + params = self.parse(new_line, template, version) + cache[key] = dict(params) + if timestamp is not None: + params.update(timestamp=timestamp) + yield line, ir.Entry(version=version, **params) + success = True + if not success: + errors.add(key) + + def items(self) -> Iterable[Tuple[int, Tuple[str, ir.Entry]]]: + yield from enumerate(self) + + @staticmethod + def _get_impl(version: int = 0) -> ParserImpl: + if version not in Parser._parser_impls: + if version not in ParserImpl._version_map: + raise ParseError(f"No parsers registered for version {version}") + Parser._parser_impls[version] = ParserImpl._version_map[version]() + return Parser._parser_impls[version] + + def parse(self, line: str, template: Optional[str], version: int = 0): + impl = self._get_impl(version) + return impl.parse(line, template) diff --git a/scripts/verbose_converter/src/utils.py b/scripts/verbose_converter/src/utils.py index 69cdcf7161c..39cc525a3d9 100644 --- a/scripts/verbose_converter/src/utils.py +++ b/scripts/verbose_converter/src/utils.py @@ -1,5 +1,5 @@ ################################################################################ -# Copyright 2021-2023 Intel Corporation +# Copyright 2021-2024 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,19 +14,39 @@ # limitations under the License. 
################################################################################ +import functools import sys -status = {"SUCCESS": 0, "FAILED": 1} + +@functools.total_ordering +class Version: + def __init__(self, major: int, minor: int, patch: int): + self.major = major + self.minor = minor + self.patch = patch + + @property + def _as_tuple(self): + return self.major, self.minor, self.patch + + def __lt__(self, other): + return self._as_tuple < other._as_tuple + + def __eq__(self, other): + return self._as_tuple == other._as_tuple def get_version(): - version = sys.version.split(" ")[0].split(".") - return {"major": int(version[0]), "minor": int(version[1]), "fix": int(version[2])} + return Version(*map(int, sys.version.split(" ")[0].split("."))) def check_version(): - v = get_version() - if not (v["major"] >= 3 and v["minor"] >= 6): - print("ERROR: unsupported python version") - return status.get("FAILED") - return status.get("SUCCESS") + return get_version() >= Version(3, 7, 0) + + +def dedent(multiline): + lines = multiline.split("\n") + if len(lines) == 1: + return lines[0].strip() + indent = min(len(line) - len(line.lstrip()) for line in lines[1:]) + return (lines[0] + "\n".join(line[indent:] for line in lines[1:])).strip() diff --git a/scripts/verbose_converter/src/writer.py b/scripts/verbose_converter/src/writer.py deleted file mode 100644 index 5ec54d7918a..00000000000 --- a/scripts/verbose_converter/src/writer.py +++ /dev/null @@ -1,30 +0,0 @@ -################################################################################ -# Copyright 2021-2023 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -################################################################################ - - -class Writer: - def __init__(self, verbose_level=0): - self.__verbose_level = int(verbose_level) - self.__file = None - - def print(self, string, type): - if type == "WARN": - print(f"{type}: {string}") - if type == "INFO": - if self.__verbose_level > 0: - print(string) - if type == "STDIO": - print(string) diff --git a/scripts/verbose_converter/tests/benchdnn_test.py b/scripts/verbose_converter/tests/benchdnn_test.py index db0d108c8d0..0bbe0a9fd83 100755 --- a/scripts/verbose_converter/tests/benchdnn_test.py +++ b/scripts/verbose_converter/tests/benchdnn_test.py @@ -15,139 +15,157 @@ # limitations under the License. 
################################################################################ -import sys, os, subprocess - import argparse +import os +import subprocess +import sys from argparse import RawTextHelpFormatter +from collections import defaultdict +from typing import Dict, List -# add parent dir to sys.path to make verbose_converter visible for test -current_dir = os.path.dirname(os.path.realpath(__file__)) -parent_dir = os.path.dirname(current_dir) -sys.path.append(parent_dir) -import verbose_converter -from src import benchdnn_generator as benchdnn_gen +class TestingException(RuntimeError): + def __init__(self, msg): + from src.utils import dedent # type: ignore[import-not-found] + + super().__init__(dedent(msg)) + -status = {"SUCCESS": 0, "FAILED": 1} +class FailedCase(TestingException): + def __init__(self, status: str, repro: str): + super().__init__(f"Failed case: {status}: {repro}") def convert_dir_benchdnn2verbose(dir): - return { + mapping = { "FWD_D": "forward_training", "FWD_B": "forward_training", "FWD_I": "forward_inference", "BWD_D": "backward_data", "BWD_W": "backward_weights", "BWD_DW": "backward", - }.get(dir) - - -def filter_verbose(benchdnn_verbose, driver): - v = "" - benchdnn_prop_kind = None - - for test_case in benchdnn_verbose.split("__REPRO"): - verbose_lines = test_case.split("\n") - # `start` with `1` as there's a leftover from previous REPRO line. - for idx, l in enumerate(verbose_lines, start=1): - # Parse header - if l.find("create: ") != -1: - # detect prop kind in benchdnn log - dir = "--prop=" if driver == "rnn" else "--dir=" - dir_start = l.find(dir) - if dir_start != -1: - dir_end = l.find(" ", dir_start) - benchdnn_prop_kind = convert_dir_benchdnn2verbose( - l[dir_start + len(dir) : dir_end] - ) - else: - benchdnn_prop_kind = None - else: - # detect driver - l_s = l.split(",") - primitive_idx = 5 - d = ( - benchdnn_gen.convert_driver(l_s[primitive_idx]) - if len(l_s) > primitive_idx - else "" - ) - if ( - len(l_s) > primitive_idx - and l_s[0] == "onednn_verbose" - and d == driver - ): - # filter out additional forward calls, it's located in two - # positions after primitive_kind. - verbose_prop_kind = l_s[primitive_idx + 2] - if ( - benchdnn_prop_kind != None - and verbose_prop_kind != benchdnn_prop_kind - ): - continue - # Filter out fill reorders. Only the last one is actual. - # `len - 1` due to status piece left in `verbose_lines` as - # a product of split by `__REPRO`. - if d == "reorder" and idx != len(verbose_lines) - 1: - continue - # Filter out transform routine till it's properly supported. - # Use impl name for that due to it's the only difference - # between two ukernel calls. 
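Both the deleted filter_verbose here and its replacement below key on benchdnn's per-case result lines, which look roughly like this (the status and repro values are illustrative):

    line = "4:SKIPPED (Case not supported) __REPRO: --matmul 10x20:20x30"
    _, status_info, repro = map(str.strip, line.split(":", 2))
    status = status_info.rsplit(None, 1)[0].split("(", 1)[0].strip()
    # status == "SKIPPED", repro == "--matmul 10x20:20x30"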
-                    impl_name = l_s[5]
-                    if d == "brgemm" and impl_name == "pack_B":
-                        continue
-
-                    # found primitive creation for the test case
-                    # remove time
-                    l_wo_time = "".join(f + "," for f in l.split(",")[0:-1])[0:-1]
-                    v += l_wo_time + "\n"
+    }
+    return mapping.get(dir, "undef")
+
+
+def filter_verbose(verbose: str, driver: str, filter_event: str):
+    found_cases: List[str] = []
+    tentative_cases: Dict[str, List[str]] = defaultdict(list)
+    for line in verbose.split("\n"):
+        if "__REPRO" in line:
+            # n: STATUS (Status message) __REPRO: repro
+            _, status_info, repro = map(str.strip, line.split(":", 2))
+            status_and_message = status_info.rsplit(None, 1)[0]
+            status = status_and_message.split("(", 1)[0].strip()
+            # workaround for nvim-treesitter indent bug: )
+            argname = "prop" if driver == "rnn" else "dir"
+            known_prop_kind: str = "undef"
+            for part in repro.split():
+                if part.startswith(f"--{argname}="):
+                    value = part[len(argname) + 3 :]
+                    known_prop_kind = convert_dir_benchdnn2verbose(value)
                     break
-    return [status.get("SUCCESS"), ""], v
-
-def generate_verbose(path_to_benchdnn, driver, batch):
+            cases = tentative_cases[known_prop_kind]
+            tentative_cases.clear()
+            if status == "SKIPPED":
+                continue
+            elif "FAILED" in status:
+                raise FailedCase(status, repro)
+            elif not cases:
+                continue
+            found_cases.append(cases[-1])
+        elif line.startswith("onednn_verbose,"):
+            # Detect driver
+            parts = line.split(",")
+            try:
+                float(parts[2])  # check for timestamp
+            except ValueError:
+                pass
+            else:
+                parts.pop(2)
+            try:
+                component = parts[2]
+                event, *_ = parts[3].split(":", 1)
+                primitive = parts[5]
+                impl_name = parts[6]
+                prop_kind = parts[7]
+            except IndexError:
+                continue
+            if component != "primitive" or event not in filter_event:
+                continue
+            if get_driver(primitive) != driver:
+                continue
+            # Filter out the transform routine until it's properly
+            # supported. Use the impl name for that, since it's the only
+            # difference between the two ukernel calls.
+            if driver == "brgemm" and impl_name == "pack_B":
+                continue
+            # Remove primitive creation/run time
+            try:
+                float(parts[-1])
+            except ValueError:
+                continue
+            without_time = ",".join(parts[:-1])
+            # Filter out fill reorders. Only the last one is real.
+            tentative_cases[prop_kind].append(without_time)
+            if prop_kind != "undef":
+                # In case the reproducer uses the default prop kind
+                tentative_cases["undef"].append(without_time)
+    return "\n".join(found_cases)
+
+
+def generate_verbose(path_to_benchdnn, engine, driver, batch):
     benchdnn_exe = path_to_benchdnn + "/benchdnn"
     sub_env = os.environ.copy()
     sub_env["ONEDNN_PRIMITIVE_CACHE_CAPACITY"] = "0"
     # Runtime dimension require execution verbose output.
     # BRGEMM driver through ukernel API supports verbose only at execution.
-    sub_env["ONEDNN_VERBOSE"] = "2"
+    profile_mode = "create"
     benchdnn_mode = "I"
-    if driver == "matmul" or driver == "reorder" or driver == "brgemm":
-        sub_env["ONEDNN_VERBOSE"] = "1"
+    if driver in ("matmul", "reorder", "brgemm"):
+        profile_mode = "exec"
         benchdnn_mode = "R"
+    # Add extra noise (dispatch, etc.)
to ensure it gets filtered out + sub_env["ONEDNN_VERBOSE"] = f"dispatch,error,check,profile_{profile_mode}" sub_args = [ benchdnn_exe, + f"--engine={engine}", f"--{driver}", f"--mode={benchdnn_mode}", - f"-v1", f"--batch={batch}", ] try: - sub = subprocess.run(sub_args, capture_output=True, text=True, env=sub_env) - except (subprocess.TimeoutExpired, subprocess.CalledProcessError) as e: - return [ - status.get("FAILED"), - f"subprocess.run() raised exception: " + f"{e.stdout}", - ], "" - except BaseException as e: - return [ - status.get("FAILED"), - f"subprocess.run() raised exception: " + f"{e.args}\n{e.stdout}", - ], "" + sub = subprocess.run( + sub_args, + capture_output=True, + text=True, + env=sub_env, + ) + except Exception as e: + raise TestingException( + f"subprocess.run() raised exception: {e!s}" + ) from None + if sub.returncode != 0: # most likely converter generated incorrect batch file - return [ - status.get("FAILED"), - f"subprocess.run() returned {sub.returncode},\n" - + f"args: {sub_args}\nstderr: {sub.stderr}", - ], "" + raise TestingException( + f""" + subprocess.run() returned {sub.returncode}, + args: {sub_args} + stderr: {sub.stderr} + """ + ) - return filter_verbose(sub.stdout, driver=driver) + filter_event = "exec" if benchdnn_mode == "R" else "create" + return filter_verbose(sub.stdout, driver, filter_event) def generate_batch(verbose, driver): + import verbose_converter # type: ignore[import-not-found] + verbose = verbose.splitlines() aggregate_opts = [ "engine", @@ -159,8 +177,7 @@ def generate_batch(verbose, driver): "alg_kind", "shapes", ] - s, data = verbose_converter.convert( - verbose_level=0, + data = verbose_converter.convert( parser="oneDNN", input=verbose, action="generate", @@ -168,68 +185,117 @@ def generate_batch(verbose, driver): split_output=True, agg_keys=aggregate_opts, ) - if s != status.get("SUCCESS"): - return [s, f"verbose_converter.convert() returned {s}"], "" - filename = "test.generated" - for key, value in data.items(): - # remove -- from driver name - driver_filename = key + "." + filename - of = open(driver_filename, "w") - print(value, file=of) - return [s, ""], driver + "." 
+ filename + filename = f"{driver}.test.generated" + output = data.get(driver, "") + with open(filename, "w") as fd: + fd.write(f"{output}\n") + return filename def compare(driver, ref_v, comp_v): - ref_lines = ref_v.splitlines() - ref_lines = [l for l in ref_lines if driver in l] - comp_lines = comp_v.splitlines() - len(comp_lines) - comp_lines = [l for l in comp_lines if driver in l] - len(comp_lines) - - for r, c in zip(ref_lines, comp_lines): - if r != c: - ref_log_filename = f"{driver}.reference.log" - com_log_filename = f"{driver}.computed.log" - ref_log = open(ref_log_filename, "w") - com_log = open(com_log_filename, "w") - print(ref_v, file=ref_log) - print(comp_v, file=com_log) - return status.get("FAILED"), f"verboses do not match,\nref: {r}\ncom: {c}" - - return status.get("SUCCESS"), "" - - -def test(path_to_benchdnn, driver, batch): - s, ref_verbose = generate_verbose(path_to_benchdnn, driver, batch) - if s[0] != status.get("SUCCESS"): - return s - # XXX: Maybe generate batch and run becndhnn for each verbose line + def filter_lines(lines): + for line in lines.splitlines(): + if driver in line: + yield line + + def without_impl(verbose_line): + parts = verbose_line.split(",") + return ",".join(parts[:6] + parts[7:]) + + def find_named_entry(name, entries): + for entry in entries: + entry_name, *entry_args = entry.split(":") + if entry_name == name: + return entry_args + return None + + def accept_results(r, c): + if r == c: + return True + + # TODO: Handle cases with non-unique md tags + # * multiple size-1 dimensions with the same stride + # * multiple dimensions with 0 stride + if driver == "matmul": + # In matmul cases with runtime dims that resolve to ones, the bias + # memory descriptor will potentially have the wrong mask printed in + # the verbose line. We do not maintain enough information to always + # print the correct mask, but the reference and computed verbose + # lines will match, up to implementation name. + parts = r.split(",") + mds = parts[8].split() + aux = parts[10].split() + shapes = parts[11].split(":", 1) + wei, act = list(map(lambda x: list(map(int, x.split("x"))), shapes)) + if find_named_entry("bia", mds) is None: + return False + rt_dim_mask = find_named_entry("runtime_dims_masks", aux) + if rt_dim_mask is None: + return False + wei_mask, act_mask = list(map(int, rt_dim_mask)) + if wei[-2] == 1 and wei_mask & (1 << (len(wei) - 2)): + return without_impl(r) == without_impl(c) + if act[-1] == 1 and act_mask & (1 << (len(act) - 1)): + return without_impl(r) == without_impl(c) + elif driver == "sum": + # There is no information in a sum verbose line about scales, so if + # dispatch depends on particular scale values, the implementation + # may change with default scales. In this case, we check that the + # rest of the verbose line is the same. + return without_impl(r) == without_impl(c) + return False + + file_map = {"reference": ref_v, "computed": comp_v} + for r, c in zip(filter_lines(ref_v), filter_lines(comp_v)): + if accept_results(r, c): + continue + for log_type, content in file_map.items(): + with open(f"{driver}.{log_type}.log", "w") as fd: + fd.write(content) + raise TestingException( + f""" + verboses do not match + ref: {r} + com: {c} + """ + ) + + +def test(path_to_benchdnn, engine, driver, batch): + ref_verbose = generate_verbose(path_to_benchdnn, engine, driver, batch) + # XXX: Maybe generate batch and run benchdnn for each verbose line # separately to detect error on case level and not on batch level? 
# The reason behind testing on batch level is that ref_verbose generator # might introduce multiple verbose lines for single line in batch file - s, gen_batch = generate_batch(ref_verbose, driver) - if s[0] != status.get("SUCCESS"): - return s - s, verbose = generate_verbose(path_to_benchdnn, driver, gen_batch) - if s[0] != status.get("SUCCESS"): - return s - - return compare(driver, ref_verbose, verbose) + com_batch = generate_batch(ref_verbose, driver) + com_verbose = generate_verbose(path_to_benchdnn, engine, driver, com_batch) + compare(driver, ref_verbose, com_verbose) + # XXX: Maybe run an additional loop + # ref -> ref verbose -> com 1 -> com 1 verbose -> com 2 -> com 2 verbose + # Comparing com 1 and com 2 verbose instead would address the special cases + # in accept_results. We can even compare just the cases where ref and com 1 + # don't match. def main(): + relpath = "../../../build/tests/benchdnn" realpath = os.path.dirname(os.path.realpath(__file__)) - print(realpath) - realpath_benchdnn = realpath + "/../../../build/tests/benchdnn" + realpath_benchdnn = os.path.realpath(f"{realpath}/{relpath}") args_parser = argparse.ArgumentParser( description="benchdnn test", formatter_class=RawTextHelpFormatter ) + args_parser.add_argument( + "-e", + "--engine", + default="cpu", + choices=("cpu", "gpu"), + help="Engine to use to run tests", + ) args_parser.add_argument( "-d", "--dataset", - default=realpath + "/" + "dataset_simple", + default=f"{realpath}/dataset_simple", help="input with benchdnn batch files", ) args_parser.add_argument( @@ -241,23 +307,49 @@ def main(): args_parser.add_argument( "-i", "--inputs_path", - default=realpath_benchdnn + "/" + "inputs", + default=f"{realpath_benchdnn}/inputs", help="Path to benchdnn batch files", ) args = args_parser.parse_args() + failed = False with open(args.dataset, "r") as dataset: for case in dataset.readlines(): - if case[0] != "#" and case[0] != "\n": - [driver, batch] = case.split(",") - batch = batch.split("\n")[0] - batch_file_path = args.inputs_path + "/" + driver + "/" + batch - s = test(args.benchdnn_path, driver, batch_file_path) - s_str = "PASSED" if s[0] == status.get("SUCCESS") else "FAILED" - print(f"BENCHDNN TEST: {driver}, {batch}: {s_str} " + s[1]) + case = case.split("#", 1)[0].strip() + if not case: + continue + driver, batch = case.split(",") + batch = batch.split("\n", 1)[0] + batch_file_path = f"{args.inputs_path}/{driver}/{batch}" + test_info = f"BENCHDNN TEST: {args.engine}, {driver}, {batch}" + try: + test(args.benchdnn_path, args.engine, driver, batch_file_path) + except Exception as e: + print(f"{test_info}: FAILED {e!s}") + failed = True + else: + print(f"{test_info}: PASSED") + return failed - return status.get("SUCCESS") + +def get_driver(primitive: str): + import src.benchdnn_generator as bg # type: ignore[import-not-found] + + try: + converter = bg.get_converter(primitive) + except KeyError: + return None + else: + return converter.driver +# Add parent dir to sys.path to make verbose_converter visible for test +current_dir = os.path.dirname(os.path.realpath(__file__)) +parent_dir = os.path.dirname(current_dir) +sys.path.append(parent_dir) + if __name__ == "__main__": - main() + try: + sys.exit(main()) + except KeyboardInterrupt: + sys.exit(0) diff --git a/scripts/verbose_converter/tests/dataset_simple b/scripts/verbose_converter/tests/dataset_simple index 90776b29557..586cd6a5bc6 100644 --- a/scripts/verbose_converter/tests/dataset_simple +++ b/scripts/verbose_converter/tests/dataset_simple @@ -1,5 +1,5 @@ 
diff --git a/scripts/verbose_converter/tests/dataset_simple b/scripts/verbose_converter/tests/dataset_simple
index 90776b29557..586cd6a5bc6 100644
--- a/scripts/verbose_converter/tests/dataset_simple
+++ b/scripts/verbose_converter/tests/dataset_simple
@@ -1,5 +1,5 @@
 ################################################################################
-# Copyright 2021-2023 Intel Corporation
+# Copyright 2021-2025 Intel Corporation
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -27,7 +27,7 @@ ip,shapes_ci
 lnorm,shapes_ci
 lrn,shapes_ci
 matmul,shapes_2d_ci
-pooling,shapes_basic
+pool,shapes_basic
 prelu,shapes_ci
 reduction,shapes_ci
 resampling,shapes_ci
diff --git a/scripts/verbose_converter/verbose_converter.py b/scripts/verbose_converter/verbose_converter.py
index 680cd2252a8..5479330dadb 100755
--- a/scripts/verbose_converter/verbose_converter.py
+++ b/scripts/verbose_converter/verbose_converter.py
@@ -15,82 +15,99 @@
 # limitations under the License.
 ################################################################################

-import sys
-
 import argparse
+import logging
+import sys
 from argparse import RawTextHelpFormatter
+from typing import IO, Dict, Iterable, List
+
+from src.benchdnn_generator import InputGenerator  # type: ignore
+from src.breakdown_generator import BreakdownGenerator  # type: ignore
+from src.dnnl_parser import LogParser  # type: ignore
+from src.utils import check_version  # type: ignore
+
+default_events = "exec", "create"
+stream_handler = logging.StreamHandler(sys.stderr)
+fmt = logging.Formatter(fmt="{levelname}: {name}: {message}", style="{")
+# workaround for nvim-treesitter indent bug: }
+stream_handler.setFormatter(fmt)
+logger = logging.getLogger("verbose_converter")
+logger.setLevel(logging.CRITICAL + 10)  # off
+logger.addHandler(stream_handler)
+
+
+def one_line(multiline: str):
+    return " ".join(map(str.strip, multiline.split("\n"))).strip()

-from src import utils
-from src import writer

+class ConverterError(RuntimeError):
+    pass
+
+
+def generate(generator, parser: LogParser, *args):
+    return generator.generate(parser.get_data(), *args)


 def convert(
-    verbose_level,
-    parser,
-    input,
-    action,
-    generator,
-    split_output,
-    agg_keys,
-    events=["create", "exec"],
-):
-    status = utils.check_version()
-    if status != utils.status.get("SUCCESS"):
-        return status
-
-    logger = writer.Writer(verbose_level=verbose_level)
-    log_parser = None
-    if parser == "oneDNN":
-        from src import dnnl_parser
+    parser: str,
+    input: Iterable[str],
+    action: str,
+    generator: str,
+    split_output: bool,
+    agg_keys: List[str],
+    events: Iterable[str] = default_events,
+) -> Dict[str, str]:
+    if not check_version():
+        raise ConverterError("Unsupported Python version")

-        log_parser = dnnl_parser.LogParser(logger, input)
+    log_parser: LogParser
+    if parser == "oneDNN":
+        log_parser = LogParser(logger, input)
     else:
-        logger.print("Error: unsupported parser", "STDIO")
-        return utils.status.get("FAILED")
+        raise ConverterError("Unsupported parser")

-    logger.print(f"Processing input ...", "INFO")
+    logger.info("Processing input ...")
     log_parser.process(events)

-    output = None
     if action == "dumpIR":
-        logger.print(f"Dumping data from input...", "INFO")
+        logger.info("Dumping data from input...")
         log_parser.dump(True)
-
-    if action == "generate":
-        logger.print(f"Generating output ...", "INFO")
+        return {}
+    elif action == "generate":
+        logger.info("Generating output ...")
         if generator == "benchdnn":
-            from src import benchdnn_generator
-
-            gen = benchdnn_generator.InputGenerator(logger)
-            output = gen.generate(log_parser.get_data(), split_output)
+            if "create_nested" in events:
+                logger.warning(
+                    one_line(
+                        """
+                        Benchdnn arguments generated from create_nested events
+                        may not work!
+                        """
+                    )
+                )
+            return generate(InputGenerator(logger), log_parser, split_output)
         elif generator == "breakdown":
-            from src import breakdown_generator
-
-            gen = breakdown_generator.BreakdownGenerator(logger)
-            output = gen.generate(log_parser.get_data(), agg_keys)
+            return generate(BreakdownGenerator(logger), log_parser, agg_keys)
         else:
-            logger.print("Error: unsupported generator", "STDIO")
-            return utils.status.get("FAILED")
-
-    return utils.status.get("SUCCESS"), output
+            raise ConverterError("Unsupported generator")
+    else:
+        raise ConverterError("Unsupported action")


-def validate_option(value, supported_values, str):
-    if not value in supported_values:
-        print(f"ERROR: {str}")
-        return utils.status.get("FAILED")
-    return utils.status.get("SUCCESS")
+def validate_option(value, supported_values, message):
+    if value not in supported_values:
+        raise ConverterError(message)


-def main():
-    status = utils.check_version()
-    if status != utils.status.get("SUCCESS"):
-        return status
+def main() -> int:
+    if not check_version():
+        logger.error("Unsupported Python version")
+        return 1

     action_opts = ["generate", "dumpIR"]
     generator_opts = ["benchdnn", "breakdown"]
     parser_opts = ["oneDNN"]
-    verbose_opts = ["0", "1"]
+    verbose_opts = [0, 1]
     aggregate_opts = [
         "engine",
         "prim_kind",
@@ -101,7 +118,7 @@ def main():
         "aux",
         "shapes",
     ]
-    event_opts = ["exec", "create"]
+    event_opts = list(default_events) + ["create_nested"]
     args_parser = argparse.ArgumentParser(
         description="oneDNN log converter", formatter_class=RawTextHelpFormatter
     )
@@ -132,12 +149,18 @@ def main():
         "--aggregate",
         nargs="+",
         default=aggregate_opts,
-        help=f"aggregates statistics on the specified keys (default: all keys but time).\nValues: {aggregate_opts}",
+        help=one_line(
+            f"""
+            aggregates statistics on the specified keys (default: all keys but
+            time). Values: {aggregate_opts}
+            """
+        ),
     )
     args_parser.add_argument(
         "-v",
         "--verbose_level",
-        default="0",
+        default=0,
+        type=int,
         help=f"verbose level (default: 0). Values: {verbose_opts}.",
     )
     args_parser.add_argument(
@@ -153,26 +176,31 @@ def main():
         "-e",
         "--events",
         nargs="+",
-        default=event_opts,
-        help=f"events to parse (default: create and exec).\nValues: {event_opts}.",
+        default=list(default_events),
+        help=one_line(
+            f"""
+            events to parse (default: create and exec). Values: {event_opts}.
+            """
+        ),
     )
     args = args_parser.parse_args()

     # validate options
-    status = validate_option(args.action, action_opts, "Unknown action value")
-    if status != utils.status.get("SUCCESS"):
-        return status
-    status = validate_option(
-        args.verbose_level, verbose_opts, "Unknown verbose_level value"
-    )
-    if status != utils.status.get("SUCCESS"):
-        return status
-    status = validate_option(args.parser, parser_opts, "Unknown parser value")
-    if status != utils.status.get("SUCCESS"):
-        return status
-    status = validate_option(args.generator, generator_opts, "Unknown generator value")
-    if status != utils.status.get("SUCCESS"):
-        return status
+    logger.setLevel(logging.ERROR)
+    try:
+        validate_option(args.action, action_opts, "Unknown action value")
+        validate_option(
+            args.verbose_level, verbose_opts, "Unknown verbose level"
+        )
+        validate_option(args.parser, parser_opts, "Unknown parser value")
+        validate_option(
+            args.generator, generator_opts, "Unknown generator value"
+        )
+        for event in args.events:
+            validate_option(event, event_opts, "Unknown event")
+    except ConverterError as e:
+        logger.error(str(e))
+        return 1

     input_data = []
     if args.input == "stdin":
         for line in sys.stdin:
             input_data.append(line)
         else:
-            print("WARN: no input was provided to the script")
+            logger.warning("No input was provided to the script")
             args_parser.print_help()
     else:
         try:
             input_data = open(args.input, "r").readlines()
         except BaseException as e:
-            print(f"Error while reading input: {e}")
-
-    output = None
+            logger.error(f"While reading input: {e!s}")
+            return 1

-    event_sets = args.events if args.generator == 'breakdown' else [args.events]
+    event_sets = (
+        [[e] for e in args.events]
+        if args.generator == "breakdown"
+        else [args.events]
+    )
+    verbosity_levels = [logging.WARNING, logging.INFO]
+    logger.setLevel(verbosity_levels[args.verbose_level])
     for events in event_sets:
-        status, output = convert(
-            verbose_level=args.verbose_level,
-            parser=args.parser,
-            input=input_data,
-            action=args.action,
-            generator=args.generator,
-            split_output=args.split,
-            agg_keys=args.aggregate,
-            events=events
-        )
-
-        if status != utils.status.get("SUCCESS"):
-            return status
+        try:
+            output = convert(
+                parser=args.parser,
+                input=input_data,
+                action=args.action,
+                generator=args.generator,
+                split_output=args.split,
+                agg_keys=args.aggregate,
+                events=events,
+            )
+        except ConverterError as e:
+            logger.error(str(e))
+            return 1

-        if output != None:
+        for key, value in output.items():
+            fd: IO
+            filename = args.output
+            if args.split:
+                filename += f".{key}"
             if args.output != "stdout":
-                if output != None:
-                    for key, value in output.items():
-                        filename = args.output
-                        if args.split == True:
-                            filename += "." + key
-                        of = open(filename, "w")
-                        if args.generator == "breakdown":
-                            print(f"Event: {events}", file=of)
-                        print(value, end="", file=of)
+                fd = open(filename, "w")
+            else:
+                fd = sys.stdout
+            if args.generator == "breakdown":
+                fd.write(f"Event: {events[0]}\n")
+            fd.write(f"{value}\n")
             else:
-                if args.generator == "breakdown":
-                    print(f"Event: {events}")
-                for key, value in output.items():
-                    if args.split == False:
-                        print(f"{value}")
-                    else:
-                        print(f"--{key}\n{value}")
+            if args.split:
+                fd.write(f"--{key}\n")
+            fd.write(f"{value}\n")
+            if args.output != "stdout":
+                fd.close()
+    return 0


 if __name__ == "__main__":
-    main()
+    try:
+        sys.exit(main())
+    except KeyboardInterrupt:
+        sys.exit(0)
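After this rewrite, `convert()` returns a `{name: text}` dictionary and signals failure by raising `ConverterError` instead of returning status tuples. A hedged sketch of a minimal caller, assuming the script's directory is on `sys.path` and using a placeholder log file name:

```python
from verbose_converter import ConverterError, convert

try:
    output = convert(
        parser="oneDNN",
        input=open("dnnl_verbose.log"),  # any iterable of verbose log lines
        action="generate",
        generator="benchdnn",
        split_output=False,
        agg_keys=[],
    )
except ConverterError as e:
    print(f"conversion failed: {e}")
else:
    for name, text in output.items():  # one entry per generated output
        print(text)
```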
" "Either disable Graph API with ONEDNN_BUILD_GRAPH=OFF or change GPU " - "vendor to INTEL with ONEDNN_GPU_VENDOR=INTEL.") + "vendor to INTEL or NVIDIA.") endif() if (NOT DNNL_ENABLE_PRIMITIVE STREQUAL "ALL") @@ -164,22 +183,12 @@ if(ONEDNN_BUILD_GRAPH) "primitive selection with ONEDNN_ENABLE_PRIMITIVE=ALL.") endif() - if(ONEDNN_EXPERIMENTAL_GRAPH_COMPILER_BACKEND) - add_definitions_with_host_compiler(-DDNNL_ENABLE_COMPILER_BACKEND) - endif() if(ONEDNN_ENABLE_GRAPH_DUMP) message(STATUS "Graph artifacts dump is enabled") add_definitions_with_host_compiler(-DDNNL_ENABLE_GRAPH_DUMP) endif() add_subdirectory(graph) - if(ONEDNN_EXPERIMENTAL_GRAPH_COMPILER_BACKEND AND TARGET dnnl_graphcompiler_llvm_lib_exclude_string) - get_property(GC_EXCLUDE_LIBS TARGET dnnl_graphcompiler_llvm_lib_exclude_string PROPERTY INTERFACE_LINK_LIBRARIES) - if(DEFINED GC_EXCLUDE_LIBS AND NOT MSVC AND NOT APPLE) - # set the LLVM symbols as hidden, or all LLVM symbols will be exported - append(CMAKE_SHARED_LINKER_FLAGS "-Wl,--exclude-libs=${GC_EXCLUDE_LIBS}") - endif() - endif() else() # If graph component is not built, remove the headers from build and installation. list(REMOVE_ITEM HEADERS_SUBDIR @@ -196,8 +205,7 @@ get_property(SHARED_LIB_DEPS GLOBAL PROPERTY DNNL_SUBDIR_EXTRA_SHARED_LIBS) add_library(${LIB_PACKAGE_NAME} ${DNNL_LIBRARY_TYPE} ${VERSION_RESOURCE_FILE} ${HEADERS_ROOT} ${HEADERS_SUBDIR} ${LIB_DEPS}) -# LINK_PRIVATE for cmake 2.8.11 compatibility -target_link_libraries(${LIB_PACKAGE_NAME} LINK_PRIVATE ${STATIC_LIB_DEPS} ${SHARED_LIB_DEPS}) +target_link_libraries(${LIB_PACKAGE_NAME} PRIVATE ${STATIC_LIB_DEPS} ${SHARED_LIB_DEPS}) set_property(TARGET ${LIB_PACKAGE_NAME} PROPERTY OUTPUT_NAME ${DNNL_LIBRARY_NAME}) set_property(TARGET ${LIB_PACKAGE_NAME} PROPERTY VERSION "${DNNL_VERSION_MAJOR}.${DNNL_VERSION_MINOR}") @@ -209,13 +217,20 @@ target_include_directories(${LIB_PACKAGE_NAME} PUBLIC $ ) -target_link_libraries_build(${LIB_PACKAGE_NAME} - "${EXTRA_SHARED_LIBS};${EXTRA_STATIC_LIBS}") +target_link_libraries(${LIB_PACKAGE_NAME} PUBLIC "$") target_link_libraries_install(${LIB_PACKAGE_NAME} "${EXTRA_SHARED_LIBS}") if(DNNL_LIBRARY_TYPE STREQUAL "STATIC") target_link_libraries_install(${LIB_PACKAGE_NAME} "${EXTRA_STATIC_LIBS}") endif() +foreach(object_library IN LISTS LIB_DEPS) + string(REPLACE "$" "" object_library "${object_library}") + + # explicitly set compile PDB name as with Ninja, all targets have the same pdb name like vc.pdb + set_target_properties(${object_library} PROPERTIES COMPILE_PDB_NAME ${object_library}) +endforeach() + set(LIB_EXPORT_NAME "${LIB_PACKAGE_NAME}-targets") install(TARGETS ${LIB_PACKAGE_NAME} EXPORT "${LIB_EXPORT_NAME}" @@ -232,7 +247,7 @@ foreach(header ${HEADERS_SUBDIR}) endforeach() string(TOUPPER "${LIB_PACKAGE_NAME}::" LIB_NAMESPACE) -if(DNNL_INSTALL_MODE STREQUAL "BUNDLE_V2" AND WIN32) +if(DNNL_INSTALL_MODE STREQUAL "BUNDLE" AND WIN32) # Config file for binary distribution needs to define a mapping # DEBUG -> RELWITHMDD so that proper library (dnnld) is picked up for the # DEBUG configuration. @@ -263,7 +278,7 @@ install(EXPORT ${LIB_EXPORT_NAME} # Apply a workaround to CMake config file to make it work with symlinks. # The patched config file is only used in oneAPI binary distribution. 
-if(UNIX AND DNNL_INSTALL_MODE STREQUAL "BUNDLE_V2") +if(UNIX AND DNNL_INSTALL_MODE STREQUAL "BUNDLE") install(CODE "file(READ \"${CMAKE_INSTALL_PREFIX}/${LIB_CONFIG_INSTALL_DIR}/${LIB_PACKAGE_NAME}-targets.cmake\" TARGETS_CONTENT)") install(CODE "string(REPLACE \"get_filename_component(_IMPORT_PREFIX \\\"\\\${CMAKE_CURRENT_LIST_FILE}\\\" PATH)\" @@ -273,7 +288,7 @@ if(UNIX AND DNNL_INSTALL_MODE STREQUAL "BUNDLE_V2") endif() # Install custom find modules for transitive dependencies -if(DNNL_CPU_THREADING_RUNTIME STREQUAL "TBB") +if("${DNNL_CPU_THREADING_RUNTIME}" MATCHES "^(TBB|TBB_AUTO)$") if(WIN32) install(FILES "../cmake/win/TBBConfig.cmake" RENAME "FindTBB.cmake" DESTINATION ${LIB_CONFIG_INSTALL_DIR}) @@ -298,6 +313,14 @@ if(DNNL_BLAS_VENDOR STREQUAL "ACCELERATE") DESTINATION ${LIB_CONFIG_INSTALL_DIR}) endif() +if(DNNL_SYCL_CUDA) + install(FILES + "../cmake/FindcuBLAS.cmake" + "../cmake/FindcublasLt.cmake" + "../cmake/FindcuDNN.cmake" + DESTINATION ${LIB_CONFIG_INSTALL_DIR}) +endif() + # On Windows we need to add dnnl.dll path to CTESTCONFIG_PATH which is later # passed to ctest and Visual Studio solutions if(WIN32) diff --git a/src/common/CMakeLists.txt b/src/common/CMakeLists.txt index ffb7d2c3831..5d698d8d0e1 100644 --- a/src/common/CMakeLists.txt +++ b/src/common/CMakeLists.txt @@ -1,5 +1,5 @@ #=============================================================================== -# Copyright 2019-2024 Intel Corporation +# Copyright 2019-2025 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -23,7 +23,7 @@ file(GLOB SOURCES if(DNNL_ENABLE_JIT_PROFILING OR DNNL_ENABLE_ITT_TASKS) if(DNNL_TARGET_ARCH STREQUAL "AARCH64" OR DNNL_TARGET_ARCH STREQUAL "X64") file(GLOB ITT_SOURCES - ${CMAKE_CURRENT_SOURCE_DIR}/ittnotify/*.[ch] + ${PROJECT_SOURCE_DIR}/third_party/ittnotify/*.c ) list(APPEND SOURCES ${ITT_SOURCES}) @@ -33,10 +33,10 @@ if(DNNL_ENABLE_JIT_PROFILING OR DNNL_ENABLE_ITT_TASKS) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DITT_API_IPT_SUPPORT") if(UNIX OR MINGW) enable_language(ASM) - set(ITT_PT ${CMAKE_CURRENT_SOURCE_DIR}/ittnotify/ittptmark64.S) + set(ITT_PT ${PROJECT_SOURCE_DIR}/third_party/ittnotify/ittptmark64.S) else() enable_language(ASM_MASM) - set(ITT_PT ${CMAKE_CURRENT_SOURCE_DIR}/ittnotify/ittptmark64.asm) + set(ITT_PT ${PROJECT_SOURCE_DIR}/third_party/ittnotify/ittptmark64.asm) endif() list(APPEND SOURCES ${ITT_PT}) endif() @@ -49,13 +49,11 @@ if(NOT DNNL_CPU_RUNTIME STREQUAL "THREADPOOL") endif() if(NOT DNNL_EXPERIMENTAL_LOGGING) - # avoid building and linking spdlog if logging support is not enabled - list(REMOVE_ITEM SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/spdlog/*") list(REMOVE_ITEM SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/logging.cpp") - list(REMOVE_ITEM SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/logging.hpp") endif() set(OBJ_LIB ${LIB_PACKAGE_NAME}_common) add_library(${OBJ_LIB} OBJECT ${SOURCES}) set_property(GLOBAL APPEND PROPERTY DNNL_LIB_DEPS $) +enable_conditional_compilation4(${OBJ_LIB}) diff --git a/src/common/batch_normalization_pd.hpp b/src/common/batch_normalization_pd.hpp index 577f3bb75a5..6cd6ce47d96 100644 --- a/src/common/batch_normalization_pd.hpp +++ b/src/common/batch_normalization_pd.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2016-2024 Intel Corporation +* Copyright 2016-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in 
compliance with the License. @@ -97,7 +97,7 @@ struct batch_normalization_pd_t : public primitive_desc_t { float alpha() const { const auto &p = attr()->post_ops_; - const bool entry_size_ok = p.entry_.size() > 0; + const bool entry_size_ok = !p.entry_.empty(); assert(entry_size_ok || fuse_norm_relu() || fuse_norm_add_relu()); if (entry_size_ok) return p.entry_[0].eltwise.alpha; return 0.f; @@ -126,16 +126,15 @@ struct batch_normalization_pd_t : public primitive_desc_t { memory_desc_t ws_md_; - batch_normalization_pd_t(const batch_normalization_desc_t *adesc, + batch_normalization_pd_t(const op_desc_t *adesc, const primitive_attr_t *attr, const batch_normalization_fwd_pd_t *hint_fwd_pd) : primitive_desc_t(attr, base_pkind) - , desc_(*adesc) + , desc_(*op_desc_t::to_desc(adesc)) , hint_fwd_pd_(hint_fwd_pd) , src_md_(desc_.src_desc) , stat_md_(desc_.stat_desc) - , scaleshift_md_(desc_.scaleshift_desc) - , ws_md_() {} + , scaleshift_md_(desc_.scaleshift_desc) {} virtual status_t init_default_ws(size_t bits_per_element) { const auto src_mdw = memory_desc_wrapper(src_md_); @@ -149,14 +148,16 @@ struct batch_normalization_pd_t : public primitive_desc_t { } }; +// NOLINTBEGIN(google-default-arguments) struct batch_normalization_fwd_pd_t : public batch_normalization_pd_t { - typedef batch_normalization_fwd_pd_t base_class; - typedef batch_normalization_fwd_pd_t hint_class; + using base_class = batch_normalization_fwd_pd_t; + using hint_class = batch_normalization_fwd_pd_t; arg_usage_t arg_usage(int arg) const override { if (arg == DNNL_ARG_SRC) return arg_usage_t::input; - if (arg == DNNL_ARG_SRC_1 && fuse_norm_add_relu()) - return arg_usage_t::input; + if (arg == DNNL_ARG_SRC_1) + return fuse_norm_add_relu() ? arg_usage_t::input + : arg_usage_t::unused; if (arg == DNNL_ARG_DST) return arg_usage_t::output; if (utils::one_of(arg, DNNL_ARG_MEAN, DNNL_ARG_VARIANCE)) { @@ -165,11 +166,14 @@ struct batch_normalization_fwd_pd_t : public batch_normalization_pd_t { return arg_usage_t::unused; } - if (arg == DNNL_ARG_SCALE && use_scale()) return arg_usage_t::input; - if (arg == DNNL_ARG_SHIFT && use_shift()) return arg_usage_t::input; + if (arg == DNNL_ARG_SCALE) + return use_scale() ? arg_usage_t::input : arg_usage_t::unused; + if (arg == DNNL_ARG_SHIFT) + return use_shift() ? arg_usage_t::input : arg_usage_t::unused; - if (arg == DNNL_ARG_WORKSPACE && !types::is_zero_md(workspace_md())) - return arg_usage_t::output; + if (arg == DNNL_ARG_WORKSPACE) + return !types::is_zero_md(workspace_md()) ? 
arg_usage_t::output + : arg_usage_t::unused; return primitive_desc_t::arg_usage(arg); } @@ -230,7 +234,7 @@ struct batch_normalization_fwd_pd_t : public batch_normalization_pd_t { protected: memory_desc_t dst_md_; - batch_normalization_fwd_pd_t(const batch_normalization_desc_t *adesc, + batch_normalization_fwd_pd_t(const op_desc_t *adesc, const primitive_attr_t *attr, const batch_normalization_fwd_pd_t *hint_fwd_pd) : batch_normalization_pd_t(adesc, attr, hint_fwd_pd) @@ -247,30 +251,36 @@ struct batch_normalization_fwd_pd_t : public batch_normalization_pd_t { weights_md()->data_type == data_type::f32); } }; +// NOLINTEND(google-default-arguments) +// NOLINTBEGIN(google-default-arguments) struct batch_normalization_bwd_pd_t : public batch_normalization_pd_t { - typedef batch_normalization_bwd_pd_t base_class; - typedef batch_normalization_fwd_pd_t hint_class; + using base_class = batch_normalization_bwd_pd_t; + using hint_class = batch_normalization_fwd_pd_t; arg_usage_t arg_usage(int arg) const override { if (utils::one_of(arg, DNNL_ARG_SRC, DNNL_ARG_MEAN, DNNL_ARG_VARIANCE, DNNL_ARG_DIFF_DST)) return arg_usage_t::input; - if (arg == DNNL_ARG_SCALE && use_scale()) return arg_usage_t::input; - if (arg == DNNL_ARG_SHIFT && use_shift()) return arg_usage_t::input; + if (arg == DNNL_ARG_SCALE) + return use_scale() ? arg_usage_t::input : arg_usage_t::unused; + if (arg == DNNL_ARG_SHIFT) + return use_shift() ? arg_usage_t::input : arg_usage_t::unused; - if (arg == DNNL_ARG_WORKSPACE && !types::is_zero_md(workspace_md())) - return arg_usage_t::input; + if (arg == DNNL_ARG_WORKSPACE) + return !types::is_zero_md(workspace_md()) ? arg_usage_t::input + : arg_usage_t::unused; if (arg == DNNL_ARG_DIFF_SRC) return arg_usage_t::output; - if (arg == DNNL_ARG_DIFF_SRC_1 && fuse_norm_add_relu()) - return arg_usage_t::output; - - if (arg == DNNL_ARG_DIFF_SCALE && use_scale()) - return arg_usage_t::output; - if (arg == DNNL_ARG_DIFF_SHIFT && use_shift()) - return arg_usage_t::output; + if (arg == DNNL_ARG_DIFF_SRC_1) + return fuse_norm_add_relu() ? arg_usage_t::output + : arg_usage_t::unused; + + if (arg == DNNL_ARG_DIFF_SCALE) + return use_scale() ? arg_usage_t::output : arg_usage_t::unused; + if (arg == DNNL_ARG_DIFF_SHIFT) + return use_shift() ? arg_usage_t::output : arg_usage_t::unused; return primitive_desc_t::arg_usage(arg); } @@ -341,7 +351,7 @@ struct batch_normalization_bwd_pd_t : public batch_normalization_pd_t { memory_desc_t diff_dst_md_; memory_desc_t diff_scaleshift_md_; - batch_normalization_bwd_pd_t(const batch_normalization_desc_t *adesc, + batch_normalization_bwd_pd_t(const op_desc_t *adesc, const primitive_attr_t *attr, const batch_normalization_fwd_pd_t *hint_fwd_pd) : batch_normalization_pd_t(adesc, attr, hint_fwd_pd) @@ -366,6 +376,7 @@ struct batch_normalization_bwd_pd_t : public batch_normalization_pd_t { diff_weights_md()->data_type)); } }; +// NOLINTEND(google-default-arguments) } // namespace impl } // namespace dnnl diff --git a/src/common/binary.cpp b/src/common/binary.cpp index 2948b5b6beb..570e6eddc3e 100644 --- a/src/common/binary.cpp +++ b/src/common/binary.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2023 Intel Corporation +* Copyright 2019-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
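The pattern in this hunk replaces compound `if (arg == X && cond)` checks with an explicit three-way classification, so every known argument reports input, output, or unused instead of falling through. A hedged Python sketch of the same contract for the forward pass (names are illustrative, not the library API):

```python
# Illustrative model of arg_usage(): each known argument maps to "input",
# "output", or "unused" depending on the descriptor's flags.
def bnorm_fwd_arg_usage(arg, use_scale, use_shift, fuse_norm_add_relu):
    if arg == "SRC":
        return "input"
    if arg == "SRC_1":  # second input of the fused add
        return "input" if fuse_norm_add_relu else "unused"
    if arg == "SCALE":
        return "input" if use_scale else "unused"
    if arg == "SHIFT":
        return "input" if use_shift else "unused"
    if arg == "DST":
        return "output"
    return "unused"

assert bnorm_fwd_arg_usage("SHIFT", True, False, False) == "unused"
```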
diff --git a/src/common/binary.cpp b/src/common/binary.cpp
index 2948b5b6beb..570e6eddc3e 100644
--- a/src/common/binary.cpp
+++ b/src/common/binary.cpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2019-2023 Intel Corporation
+* Copyright 2019-2025 Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -48,23 +48,24 @@ status_t binary_attr_check(const binary_desc_t &desc, const engine_t *engine,

     // Check attributes
     const data_type_t dst_dt = desc.dst_desc.data_type;
-    auto attr_mask = smask_t::post_ops | smask_t::scales_runtime;
+    auto attr_mask = smask_t::post_ops | smask_t::scales;
     VCHECK_BINARY_UNIMPL(attr->has_default_values(attr_mask, dst_dt),
             VERBOSE_UNSUPPORTED_ATTR);

     // Check scales
     if (!attr->scales_.has_default_values()) {
-        VCHECK_BINARY_UNIMPL(attr->scales_.has_default_values(
-                                     {DNNL_ARG_SRC_0, DNNL_ARG_SRC_1}),
+        static const std::vector<int> supported_args {
+                DNNL_ARG_SRC_0, DNNL_ARG_SRC_1};
+        VCHECK_BINARY_UNIMPL(attr->scales_.has_default_values(supported_args),
                 VERBOSE_UNSUPPORTED_SCALES_CFG);
-        const auto &sc = attr->scales_;
-        const int mask_src_0 = sc.get(DNNL_ARG_SRC_0).mask_;
-        const int mask_src_1 = sc.get(DNNL_ARG_SRC_1).mask_;
+        for (int arg : supported_args) {
+            if (attr->scales_.has_default_values(arg)) continue;

-        VCHECK_BINARY_UNIMPL(utils::everyone_is(0, mask_src_0, mask_src_1),
-                VERBOSE_UNSUPPORTED_SCALES_CFG);
+            const int mask = attr->scales_.get_mask(arg);
+            VCHECK_BINARY_UNIMPL(mask == 0, VERBOSE_UNSUPPORTED_SCALES_CFG);
+        }
     }

     // Check post-ops
@@ -77,30 +78,24 @@
         // Check sum
         VCHECK_BINARY_UNIMPL(po.check_sum_consistency(dst_dt, false, true),
                 VERBOSE_UNSUPPORTED_POSTOP);
-    }

+        // Note: verbose support is inside the call.
+        CHECK(po.validate_binary_with_dst_consistency(&desc.dst_desc));
+    }
     return status::success;
 }

-status_t dnnl_binary_primitive_desc_create(
-        primitive_desc_iface_t **primitive_desc_iface, engine_t *engine,
-        alg_kind_t alg_kind, const memory_desc_t *src0_md,
-        const memory_desc_t *src1_md, const memory_desc_t *dst_md,
-        const primitive_attr_t *attr) {
+status_t binary_md_check(const engine_t *engine, alg_kind_t alg_kind,
+        const memory_desc_t *src0_md, const memory_desc_t *src1_md,
+        const memory_desc_t *src2_md, const memory_desc_t *dst_md) {

     VCHECK_BINARY(!any_null(src0_md, src1_md, dst_md), VERBOSE_NULL_ARG);
-    VCHECK_BINARY(
-            one_of(alg_kind, binary_add, binary_mul, binary_max, binary_min,
-                    binary_div, binary_sub, binary_ge, binary_gt, binary_le,
-                    binary_lt, binary_eq, binary_ne),
-            VERBOSE_BAD_ALGORITHM);
+    VCHECK_BINARY(IMPLICATION(alg_kind == binary_select, src2_md != nullptr),
+            VERBOSE_NULL_ARG);
+
+    // TODO - Add support for mutual or bi-directional broadcasts
     VCHECK_BINARY(!memory_desc_wrapper(src0_md).format_any(),
             VERBOSE_UNSUPPORTED_TAG_S, "src0");

-    auto bod = binary_desc_t();
-    bod.primitive_kind = primitive_kind::binary;
-    bod.alg_kind = alg_kind;
-
     VCONDCHECK(primitive, create, check, binary,
             !memory_desc_wrapper(src0_md).has_runtime_dims_or_strides(),
             status::unimplemented, VERBOSE_RUNTIMEDIM_UNSUPPORTED);
@@ -111,10 +106,6 @@
             !memory_desc_wrapper(dst_md).has_runtime_dims_or_strides(),
             status::unimplemented, VERBOSE_RUNTIMEDIM_UNSUPPORTED);

-    bod.src_desc[0] = *src0_md;
-    bod.src_desc[1] = *src1_md;
-    bod.dst_desc = *dst_md;
-
     const int ndims = dst_md->ndims;
     const dims_t &dims = dst_md->dims;

@@ -122,8 +113,19 @@
     VCHECK_BINARY(
             src0_md->ndims == ndims, VERBOSE_INCONSISTENT_NDIMS, "src0", "dst");
     VCHECK_BINARY(
             src1_md->ndims == ndims, VERBOSE_INCONSISTENT_NDIMS, "src1", "dst");
+
+    if (src2_md != nullptr) {
+        VCONDCHECK(primitive, create, check, binary,
+                !memory_desc_wrapper(src2_md).has_runtime_dims_or_strides(),
+                status::unimplemented, VERBOSE_RUNTIMEDIM_UNSUPPORTED);
+        VCHECK_BINARY(src2_md->ndims == ndims, VERBOSE_INCONSISTENT_NDIMS,
+                "src2", "dst");
+        VCHECK_BINARY(
+                src2_md->data_type == data_type::s8, VERBOSE_UNSUPPORTED_DT);
+    }
+
     for (int d = 0; d < ndims; ++d) {
-        //dims must equal eachother or equal 1 (broadcast)
+        //dims must equal each other or equal 1 (broadcast)
         VCHECK_BINARY(utils::one_of(src0_md->dims[d], 1, dims[d]),
                 VERBOSE_BAD_DIM, "src0", d);
         VCHECK_BINARY(utils::one_of(src1_md->dims[d], 1, dims[d]),
@@ -131,7 +133,49 @@
         VCHECK_BINARY(IMPLICATION(src0_md->dims[d] != dims[d],
                               src1_md->dims[d] == dims[d]),
                 VERBOSE_INCONSISTENT_DIM, "src1", d, "dst", d);
+
+        if (src2_md != nullptr) {
+            VCHECK_BINARY(utils::one_of(src2_md->dims[d], 1, dims[d]),
+                    VERBOSE_BAD_DIM, "src2", d);
+            VCHECK_BINARY(IMPLICATION(src0_md->dims[d] != dims[d],
+                                  src2_md->dims[d] == src0_md->dims[d]),
+                    VERBOSE_INCONSISTENT_DIM, "src0", d, "src2", d);
+        }
     }
+
+    return status::success;
+}
+
+status_t dnnl_binary_primitive_desc_create(
+        primitive_desc_iface_t **primitive_desc_iface, engine_t *engine,
+        alg_kind_t alg_kind, const memory_desc_t *src0_md,
+        const memory_desc_t *src1_md, const memory_desc_t *dst_md,
+        const primitive_attr_t *attr) {
+
+    return dnnl_binary_primitive_desc_create_v2(primitive_desc_iface, engine,
+            alg_kind, src0_md, src1_md, nullptr, dst_md, attr);
+}
+
+status_t dnnl_binary_primitive_desc_create_v2(
+        primitive_desc_iface_t **primitive_desc_iface, engine_t *engine,
+        alg_kind_t alg_kind, const memory_desc_t *src0_md,
+        const memory_desc_t *src1_md, const memory_desc_t *src2_md,
+        const memory_desc_t *dst_md, const primitive_attr_t *attr) {
+    VCHECK_BINARY(
+            one_of(alg_kind, binary_add, binary_mul, binary_max, binary_min,
+                    binary_div, binary_sub, binary_ge, binary_gt, binary_le,
+                    binary_lt, binary_eq, binary_ne, binary_select, binary_prelu),
+            VERBOSE_BAD_ALGORITHM);
+
+    CHECK(binary_md_check(engine, alg_kind, src0_md, src1_md, src2_md, dst_md));
+
+    auto bod = binary_desc_t();
+    bod.primitive_kind = primitive_kind::binary;
+    bod.alg_kind = alg_kind;
+
+    bod.src_desc[0] = *src0_md;
+    bod.src_desc[1] = *src1_md;
+    if (alg_kind == binary_select) bod.src_desc[2] = *src2_md;
+    bod.dst_desc = *dst_md;

     CHECK(binary_attr_check(bod, engine, attr));

     return primitive_desc_create(primitive_desc_iface, engine,
diff --git a/src/common/binary_pd.hpp b/src/common/binary_pd.hpp
index aa0d2cb23cf..2ec1c31bb5b 100644
--- a/src/common/binary_pd.hpp
+++ b/src/common/binary_pd.hpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2019-2024 Intel Corporation
+* Copyright 2019-2025 Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -37,11 +37,12 @@
 namespace dnnl {
 namespace impl {

+// NOLINTBEGIN(google-default-arguments)
 struct binary_pd_t : public primitive_desc_t {
     static constexpr auto base_pkind = primitive_kind::binary;

-    typedef binary_pd_t base_class;
-    typedef binary_pd_t hint_class;
+    using base_class = binary_pd_t;
+    using hint_class = binary_pd_t;

     const binary_desc_t *desc() const { return &desc_; }
     const op_desc_t *op_desc() const override {
@@ -59,7 +60,8 @@ struct binary_pd_t : public primitive_desc_t {
     }

     arg_usage_t arg_usage(int arg) const override {
-        if (arg == DNNL_ARG_SRC_0 || arg == DNNL_ARG_SRC_1)
+        if (arg == DNNL_ARG_SRC_0 || arg == DNNL_ARG_SRC_1
+                || arg == DNNL_ARG_SRC_2)
             return arg_usage_t::input;

         if (arg == DNNL_ARG_DST) return arg_usage_t::output;
@@ -72,6 +74,7 @@ struct binary_pd_t : public primitive_desc_t {
         switch (arg) {
             case DNNL_ARG_SRC_0: return src_md(0);
             case DNNL_ARG_SRC_1: return src_md(1);
+            case DNNL_ARG_SRC_2: return src_md(2);
             case DNNL_ARG_DST: return dst_md(0, user_input);
             default: return primitive_desc_t::arg_md(arg);
         }
@@ -81,6 +84,7 @@ struct binary_pd_t : public primitive_desc_t {
             int index = 0, bool user_input = false) const override {
         if (index == 0) return user_input ? &desc()->src_desc[0] : &src0_md_;
         if (index == 1) return user_input ? &desc()->src_desc[1] : &src1_md_;
+        if (index == 2) return user_input ? &desc()->src_desc[2] : &src2_md_;
         return &glob_zero_md;
     }
     const memory_desc_t *dst_md(
@@ -89,7 +93,9 @@ struct binary_pd_t : public primitive_desc_t {
         return &glob_zero_md;
     }

-    int n_inputs() const override { return 2 + n_binary_po_inputs(); }
+    int n_inputs() const override {
+        return 2 + n_binary_po_inputs() + static_cast<int>(is_ternary_op());
+    }
     int n_outputs() const override { return 1; }

     const dims_t &broadcast_dims() const { return broadcast_dims_; }
@@ -106,21 +112,29 @@ struct binary_pd_t : public primitive_desc_t {
         return src0_d.consistent_with(src1_d);
     }

+    bool is_ternary_op() const {
+        const memory_desc_wrapper src2_d(src_md(2));
+        return !src2_d.is_zero()
+                && (desc()->alg_kind == alg_kind::binary_select);
+    }
+
 protected:
     binary_desc_t desc_;

     memory_desc_t src0_md_;
     memory_desc_t src1_md_;
+    memory_desc_t src2_md_;
     memory_desc_t dst_md_;

     dims_t broadcast_dims_;

-    binary_pd_t(const binary_desc_t *adesc, const primitive_attr_t *attr,
+    binary_pd_t(const op_desc_t *adesc, const primitive_attr_t *attr,
             const binary_pd_t *hint_fwd_pd)
         : primitive_desc_t(attr, base_pkind)
-        , desc_(*adesc)
+        , desc_(*op_desc_t::to_desc(adesc))
        , src0_md_(desc_.src_desc[0])
        , src1_md_(desc_.src_desc[1])
+        , src2_md_(desc_.src_desc[2])
        , dst_md_(desc_.dst_desc) {
         init_broadcast_dims();
     }
@@ -134,6 +148,14 @@ struct binary_pd_t : public primitive_desc_t {
             }
         }

+        if (is_ternary_op() && src2_md_.format_kind == format_kind::any) {
+            const memory_desc_wrapper src_d(src_md(0));
+            if (src_d.is_blocking_desc()) {
+                CHECK(memory_desc_init_by_blocking_desc(
+                        src2_md_, src_d.blocking_desc()));
+            }
+        }
+
         if (dst_md_.format_kind == format_kind::any) {
             const memory_desc_wrapper src_d(src_md(0));
             if (src_d.is_blocking_desc()) {
@@ -158,10 +180,13 @@ struct binary_pd_t : public primitive_desc_t {
     bool attr_scales_ok(const std::vector<int> &supported_args
             = {DNNL_ARG_SRC_0, DNNL_ARG_SRC_1, DNNL_ARG_DST}) const {
-        bool ok = attr()->scales_.has_default_values(supported_args);
-        for (int arg : supported_args) {
-            const auto &mask = attr()->scales_.get(arg).mask_;
-            ok = ok && (mask == 0);
+        const auto &scales = attr()->scales_;
+        bool ok = scales.has_default_values(supported_args);
+
+        for (const auto &arg : supported_args) {
+            if (scales.has_default_values(arg)) continue;
+
+            ok = ok && scales.get_mask(arg) == 0;
         }
         return ok;
     }
@@ -176,6 +201,7 @@ struct binary_pd_t : public primitive_desc_t {
                 = (dims_A[d] == dims_B[d] && dims_A[d] != 1) ? 0 : 1;
         }
     };
+// NOLINTEND(google-default-arguments)

 } // namespace impl
 } // namespace dnnl
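The new `binary_select` algorithm makes the binary primitive ternary: `src2` is an s8 condition tensor that picks between `src0` and `src1` elementwise, with each of its dimensions required to equal the dst dimension or 1 (broadcast). A hedged NumPy sketch of the intended semantics, not the library's implementation:

```python
import numpy as np

def binary_select_ref(src0, src1, src2):
    """Reference for dst = src2 ? src0 : src1, with broadcasting."""
    cond = np.broadcast_to(src2 != 0, src0.shape)
    return np.where(cond, src0, np.broadcast_to(src1, src0.shape))

src0 = np.arange(6, dtype=np.float32).reshape(2, 3)
src1 = np.zeros((2, 3), dtype=np.float32)
cond = np.array([[1], [0]], dtype=np.int8)  # broadcast along dim 1
print(binary_select_ref(src0, src1, cond))  # row 0 from src0, row 1 from src1
```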
diff --git a/src/common/broadcast_strategy.cpp b/src/common/broadcast_strategy.cpp
index 6123f917cac..8ab33aa1d54 100644
--- a/src/common/broadcast_strategy.cpp
+++ b/src/common/broadcast_strategy.cpp
@@ -34,6 +34,7 @@ broadcasting_strategy_t get_rhs_arg_broadcasting_strategy(
     static const bcast_set_t all_bcast_strategies {
             broadcasting_strategy_t::scalar, broadcasting_strategy_t::per_oc,
             broadcasting_strategy_t::per_oc_spatial,
+            broadcasting_strategy_t::per_oc_d,
             broadcasting_strategy_t::shared_axes,
             broadcasting_strategy_t::per_mb,
             broadcasting_strategy_t::per_mb_spatial,
@@ -164,6 +165,17 @@ bool is_spatial_bcast(const std::bitset mask,
     return spatial_bcast;
 }

+// Check if mask corresponds to per oc_d
+// true if dim == 4 and mask = [1, 0, 0, 1]
+bool is_per_oc_d_bcast(const std::bitset mask,
+        const memory_desc_t &rhs_arg_md, const memory_desc_wrapper &dst_d) {
+    const dims_t &rdims = rhs_arg_md.dims;
+    const dims_t &ddims = dst_d.dims();
+    if (rhs_arg_md.ndims != 4) return false;
+    if (!mask.test(0) || !mask.test(3)) return false;
+    if (rdims[1] != ddims[1] || rdims[2] != ddims[2]) return false;
+    return true;
+}

 bool bcast_strategy_enabled(const bcast_set_t &supported_strategy_set,
         const broadcasting_strategy_t &bcast) {
@@ -254,7 +266,10 @@ broadcasting_strategy_t get_rhs_arg_broadcasting_strategy(
     else if (is_spatial_bcast(mask, dst_d)
             && is_enabled(broadcasting_strategy_t::spatial))
         bcast = broadcasting_strategy_t::spatial;
-    else if (is_enabled(broadcasting_strategy_t::shared_axes))
+    else if (is_per_oc_d_bcast(mask, rhs_arg_md, dst_d)
+            && is_enabled(broadcasting_strategy_t::per_oc_d)) {
+        bcast = broadcasting_strategy_t::per_oc_d;
+    } else if (is_enabled(broadcasting_strategy_t::shared_axes))
         bcast = broadcasting_strategy_t::shared_axes;

     return bcast;
diff --git a/src/common/broadcast_strategy.hpp b/src/common/broadcast_strategy.hpp
index 8b10e205fff..1ec98e52363 100644
--- a/src/common/broadcast_strategy.hpp
+++ b/src/common/broadcast_strategy.hpp
@@ -34,6 +34,7 @@ enum class broadcasting_strategy_t {
     per_oc, // [1, c, 1, 1, 1] // Channel-wise
     per_oc_spatial, // [1, c, 1, 1, 1] specific case for binary kernel nchw format
     per_mb, // [n, 1, 1, 1, 1] // broadcast per batch
+    per_oc_d, // [a, b, c, d] -> [1, b, c, 1]; [n, g, oc/g, sp] --> [1, g, oc/g, 1] specific case for ncsp matmul reduction.
     per_mb_spatial, // [n, 1, d, h, w] // Broadcast only channel
     per_mb_w, // [n, 1, 1, 1, w] // Broadcast per batch and width
     per_w, // [1, 1, 1, 1, w] // Broadcast per width
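In this mask convention a set bit marks a broadcast (size-1) dimension, so `per_oc_d` matches a 4D rhs shaped [1, b, c, 1] against a dst shaped [a, b, c, d]. A small Python sketch mirroring `is_per_oc_d_bcast` (names and inputs are illustrative):

```python
def is_per_oc_d_bcast(rhs_dims, dst_dims):
    """True for a 4D rhs shaped [1, b, c, 1] against dst [a, b, c, d]."""
    if len(rhs_dims) != 4:
        return False
    bcast = [r == 1 for r in rhs_dims]  # set entries mark broadcast dims
    if not (bcast[0] and bcast[3]):
        return False
    return rhs_dims[1] == dst_dims[1] and rhs_dims[2] == dst_dims[2]

assert is_per_oc_d_bcast([1, 8, 16, 1], [4, 8, 16, 32])
assert not is_per_oc_d_bcast([1, 8, 1, 1], [4, 8, 16, 32])
```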
diff --git a/src/common/c_types_map.hpp b/src/common/c_types_map.hpp
index a299936466c..64abe75b796 100644
--- a/src/common/c_types_map.hpp
+++ b/src/common/c_types_map.hpp
@@ -1,5 +1,6 @@
 /*******************************************************************************
-* Copyright 2016-2024 Intel Corporation
+* Copyright 2016-2025 Intel Corporation
+* Copyright 2024-2025 FUJITSU LIMITED
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -90,6 +91,9 @@ const alg_kind_t eltwise_gelu_tanh = dnnl_eltwise_gelu_tanh;
 const alg_kind_t eltwise_gelu_erf = dnnl_eltwise_gelu_erf;
 const alg_kind_t eltwise_hardswish = dnnl_eltwise_hardswish;
 const alg_kind_t eltwise_hardsigmoid = dnnl_eltwise_hardsigmoid;
+const alg_kind_t eltwise_hsigmoid = dnnl_eltwise_hsigmoid;
+const alg_kind_t eltwise_round_half_to_even = dnnl_eltwise_round_half_to_even;
+const alg_kind_t eltwise_round_half_away_from_zero = dnnl_eltwise_round_half_away_from_zero;
 const alg_kind_t eltwise_relu_use_dst_for_bwd
         = dnnl_eltwise_relu_use_dst_for_bwd;
 const alg_kind_t eltwise_tanh_use_dst_for_bwd
@@ -126,6 +130,8 @@ const alg_kind_t binary_le = dnnl_binary_le;
 const alg_kind_t binary_lt = dnnl_binary_lt;
 const alg_kind_t binary_eq = dnnl_binary_eq;
 const alg_kind_t binary_ne = dnnl_binary_ne;
+const alg_kind_t binary_select = dnnl_binary_select;
+const alg_kind_t binary_prelu = dnnl_binary_prelu;
 const alg_kind_t resampling_nearest = dnnl_resampling_nearest;
 const alg_kind_t resampling_linear = dnnl_resampling_linear;
 const alg_kind_t reduction_max = dnnl_reduction_max;
@@ -141,11 +147,23 @@ const alg_kind_t reduction_norm_lp_power_p_sum
         = dnnl_reduction_norm_lp_power_p_sum;
 const alg_kind_t softmax_accurate = dnnl_softmax_accurate;
 const alg_kind_t softmax_log = dnnl_softmax_log;
+const alg_kind_t depthwise_scale_shift = dnnl_depthwise_scale_shift;
+const alg_kind_t depthwise_prelu = dnnl_depthwise_prelu;
+const alg_kind_t quantization_quantize_dequantize = dnnl_quantization_quantize_dequantize;
+const alg_kind_t quantization_quantize = dnnl_quantization_quantize;
+const alg_kind_t binarization_depthwise = dnnl_binarization_depthwise;
+
+// Internal only alg kinds.
+const alg_kind_t internal_only_start = (alg_kind_t)(1 << 12);
+// GPU only via jit_eltwise injector.
+const alg_kind_t eltwise_stochastic_round
+        = (alg_kind_t)(internal_only_start + 1);
 } // namespace alg_kind

 using data_type_t = dnnl_data_type_t;
 namespace data_type {
 const data_type_t undef = dnnl_data_type_undef;
+const data_type_t f4_e3m0 = dnnl_f4_e3m0;
+const data_type_t f4_e2m1 = dnnl_f4_e2m1;
 const data_type_t e8m0 = dnnl_e8m0;
 const data_type_t f8_e5m2 = dnnl_f8_e5m2;
 const data_type_t f8_e4m3 = dnnl_f8_e4m3;
@@ -161,9 +179,11 @@ const data_type_t u4 = dnnl_u4;
 const data_type_t boolean = dnnl_boolean;
 const data_type_t data_type_max = dnnl_data_type_max;

+const data_type_t bin = dnnl_bin;
+const data_type_t nf4 = dnnl_nf4;
+
 // Not exposed through API as all current uses are internal only
 const data_type_t tf32 = static_cast<data_type_t>(1 << 8);
-
 } // namespace data_type

 using fpmath_mode_t = dnnl_fpmath_mode_t;
@@ -202,16 +222,18 @@ using sparse_encoding_t = dnnl_sparse_encoding_t;
 namespace sparse_encoding {
 const sparse_encoding_t undef = dnnl_sparse_encoding_undef;
 const sparse_encoding_t csr = dnnl_csr;
+const sparse_encoding_t coo = dnnl_coo;
 const sparse_encoding_t packed = dnnl_packed;
 } // namespace sparse_encoding
 #else
 // Declare dummy values to avoid guarding internal implementation.
-using sparse_encoding_t = int;
-namespace sparse_encoding {
-const sparse_encoding_t undef = 0;
-const sparse_encoding_t csr = 1;
-const sparse_encoding_t packed = 2;
-} // namespace sparse_encoding
+// using sparse_encoding_t = int;
+// namespace sparse_encoding {
+// const sparse_encoding_t undef = 0;
+// const sparse_encoding_t csr = 1;
+// const sparse_encoding_t packed = 2;
+// const sparse_encoding_t coo = 3;
+// } // namespace sparse_encoding
 #endif

 using format_kind_t = dnnl_format_kind_t;
@@ -223,13 +245,15 @@ const format_kind_t opaque = dnnl_format_kind_opaque;
 #ifdef DNNL_EXPERIMENTAL_SPARSE
 const format_kind_t sparse = dnnl_format_kind_sparse;
 #else
-const format_kind_t sparse = static_cast<format_kind_t>(4);
+// const format_kind_t sparse = static_cast<format_kind_t>(4);
 #endif

 // Internal only format kinds.
 const format_kind_t internal_only_start = (format_kind_t)(1 << 8);
 const format_kind_t wino = internal_only_start;
 const format_kind_t rnn_packed = (format_kind_t)(internal_only_start + 1);
+const format_kind_t cublaslt_blocked = (format_kind_t)(internal_only_start + 2);
+const format_kind_t sparse = dnnl_format_sparse;
 } // namespace format_kind

 #ifdef DNNL_EXPERIMENTAL_PROFILING
@@ -248,6 +272,8 @@ const profiling_data_kind_t internal_only_start
         = (profiling_data_kind_t)(1 << 8);
 const profiling_data_kind_t cycles
         = (profiling_data_kind_t)(internal_only_start + 1);
+const profiling_data_kind_t time_per_kernel
+        = (profiling_data_kind_t)(internal_only_start + 2);
 } // namespace profiling_data_kind

 using format_tag_t = dnnl_format_tag_t;
@@ -358,6 +384,9 @@ const format_tag_t aCB16b16c = dnnl_aCB16b16c;
 const format_tag_t aCB16b32c = dnnl_aCB16b32c;
 const format_tag_t aCB16b48c = dnnl_aCB16b48c;
 const format_tag_t aCB16b64c = dnnl_aCB16b64c;
+const format_tag_t BA24b8a = dnnl_BA24b8a;
+const format_tag_t aCB24c8b = dnnl_aCB24c8b;
+const format_tag_t abDC24d8c = dnnl_abDC24d8c;
 const format_tag_t aCB16b16c2b = dnnl_aCB16b16c2b;
 const format_tag_t aCB16b32c2b = dnnl_aCB16b32c2b;
 const format_tag_t aCB16b48c2b = dnnl_aCB16b48c2b;
@@ -369,6 +398,7 @@ const format_tag_t aCB16b64c4b = dnnl_aCB16b64c4b;
 const format_tag_t Ab4a = dnnl_Ab4a;
 const format_tag_t Ab8a = dnnl_Ab8a;
+const format_tag_t Ab32a = dnnl_Ab32a;
 const format_tag_t Abc16a = dnnl_Abc16a;
 const format_tag_t ABc16a16b = dnnl_ABc16a16b;
 const format_tag_t ABc4a2b = dnnl_ABc4a2b;
@@ -471,6 +501,7 @@ const format_tag_t aBCd4b4c = dnnl_aBCd4b4c;
 const format_tag_t ABcd8a16b2a = dnnl_ABcd8a16b2a;
 const format_tag_t BAcd8a16b2a = dnnl_BAcd8a16b2a;
 const format_tag_t ABcd8a8b = dnnl_ABcd8a8b;
+const format_tag_t ABcd8a32b = dnnl_ABcd8a32b;
 const format_tag_t ABcd8a4b = dnnl_ABcd8a4b;
 const format_tag_t ABcd8a2b = dnnl_ABcd8a2b;
 const format_tag_t aBcd8b = dnnl_aBcd8b;
@@ -615,6 +646,7 @@ const format_tag_t aBdefc16b = dnnl_aBdefc16b;
 const format_tag_t aBdefC16b2c = dnnl_aBdefC16b2c;
 const format_tag_t aBdefC16b4c = dnnl_aBdefC16b4c;
 const format_tag_t aCBdef16c16b = dnnl_aCBdef16c16b;
+const format_tag_t aCBdef8b8c = dnnl_aCBdef8b8c;
 const format_tag_t aCBdef16b16c = dnnl_aCBdef16b16c;
 const format_tag_t aBdefc4b = dnnl_aBdefc4b;
 const format_tag_t aBdefc8b = dnnl_aBdefc8b;
@@ -629,8 +661,10 @@ const format_tag_t Acb4a = dnnl_Acb4a;
 const format_tag_t Acb8a = dnnl_Acb8a;
 const format_tag_t AcB8a2b = dnnl_AcB8a2b;
 const format_tag_t AcB8a4b = dnnl_AcB8a4b;
+const format_tag_t aCBd8b8c = dnnl_aCBd8b8c;
 const format_tag_t aCBd16b16c = dnnl_aCBd16b16c;
 const format_tag_t aCBd16c16b = dnnl_aCBd16c16b;
+const format_tag_t aCBde8b8c = dnnl_aCBde8b8c;
 const format_tag_t aCBde16b16c = dnnl_aCBde16b16c;
 const format_tag_t aCBde16c16b = dnnl_aCBde16c16b;
 const format_tag_t Acdb16a = dnnl_Acdb16a;
@@ -649,7 +683,9 @@ const format_tag_t AcdeB8a2b = dnnl_AcdeB8a2b;
 const format_tag_t AcdeB8a4b = dnnl_AcdeB8a4b;
 const format_tag_t Acedb16a = dnnl_Acedb16a;
 const format_tag_t Adcb16a = dnnl_Adcb16a;
+const format_tag_t BAc8a8b = dnnl_BAc8a8b;
 const format_tag_t BAc16a16b = dnnl_BAc16a16b;
+const format_tag_t BAcd8a8b = dnnl_BAcd8a8b;
 const format_tag_t BAcd16a16b = dnnl_BAcd16a16b;
 const format_tag_t ABc32a16b = dnnl_ABc32a16b;
 const format_tag_t ABcd32a16b = dnnl_ABcd32a16b;
@@ -658,6 +694,7 @@ const format_tag_t ABc40a16b = dnnl_ABc40a16b;
 const format_tag_t ABcd40a16b = dnnl_ABcd40a16b;
 const format_tag_t ABcde40a16b = dnnl_ABcde40a16b;
 const format_tag_t ABc32a32b = dnnl_ABc32a32b;
+const format_tag_t BAcde8a8b = dnnl_BAcde8a8b;
 const format_tag_t BAcde16a16b = dnnl_BAcde16a16b;
 const format_tag_t ABcd32a32b = dnnl_ABcd32a32b;
 const format_tag_t ABcde32a32b = dnnl_ABcde32a32b;
@@ -666,6 +703,8 @@ const format_tag_t ABcd40a32b = dnnl_ABcd40a32b;
 const format_tag_t ABcde40a32b = dnnl_ABcde40a32b;
 const format_tag_t BAcde16b16a = dnnl_BAcde16b16a;
 const format_tag_t aBdec32b = dnnl_aBdec32b;
+const format_tag_t Abcdef4a = dnnl_Abcdef4a;
+const format_tag_t Abcdef8a = dnnl_Abcdef8a;
 const format_tag_t Abcdef16a = dnnl_Abcdef16a;
 const format_tag_t Abcdef32a = dnnl_Abcdef32a;
 const format_tag_t Acdb32a = dnnl_Acdb32a;
@@ -689,6 +728,7 @@ const format_tag_t AB32a32b8a2b = dnnl_AB32a32b8a2b;
 const format_tag_t AB8a2b = dnnl_AB8a2b;
 const format_tag_t abDc16d = dnnl_abDc16d;
 const format_tag_t abDc32d = dnnl_abDc32d;
+const format_tag_t abDC16d4c = dnnl_abDC16d4c;
 const format_tag_t abDC32d4c = dnnl_abDC32d4c;
 const format_tag_t abCd4c = dnnl_abCd4c;
 const format_tag_t abCde4c = dnnl_abCde4c;
@@ -698,6 +738,7 @@ const format_tag_t abCde32c = dnnl_abCde32c;
 const format_tag_t abCdef32c = dnnl_abCdef32c;
 const format_tag_t abdEc16e = dnnl_abdEc16e;
 const format_tag_t abdEc32e = dnnl_abdEc32e;
+const format_tag_t abdEC16e4c = dnnl_abdEC16e4c;
 const format_tag_t abdEC32e2c = dnnl_abdEC32e2c;
 const format_tag_t abdEC32e4c = dnnl_abdEC32e4c;
 const format_tag_t abdEC64e2c = dnnl_abdEC64e2c;
@@ -1163,7 +1204,10 @@ const format_tag_t IOhw16i16o = dnnl_IOhw16i16o;
 const format_tag_t Ohwi32o = dnnl_Ohwi32o;
 const format_tag_t gIOhw16i16o = dnnl_gIOhw16i16o;
 const format_tag_t gOhwi32o = dnnl_gOhwi32o;
+const format_tag_t Goidhw4g = dnnl_Goidhw4g;
+const format_tag_t Goidhw8g = dnnl_Goidhw8g;
 const format_tag_t Goidhw16g = dnnl_Goidhw16g;
+const format_tag_t IOw8o8i = dnnl_IOw8o8i;
 const format_tag_t IOw16o16i = dnnl_IOw16o16i;
 const format_tag_t IOw16i16o = dnnl_IOw16i16o;
 const format_tag_t gIOw16i16o = dnnl_gIOw16i16o;
@@ -1219,7 +1263,9 @@ const format_tag_t Owi4o = dnnl_Owi4o;
 const format_tag_t Owi8o = dnnl_Owi8o;
 const format_tag_t OwI8o2i = dnnl_OwI8o2i;
 const format_tag_t OwI8o4i = dnnl_OwI8o4i;
+const format_tag_t IOdhw8o8i = dnnl_IOdhw8o8i;
 const format_tag_t IOdhw16o16i = dnnl_IOdhw16o16i;
+const format_tag_t IOhw8o8i = dnnl_IOhw8o8i;
 const format_tag_t IOhw16o16i = dnnl_IOhw16o16i;
 const format_tag_t Ohwi16o = dnnl_Ohwi16o;
 const format_tag_t OhwI16o2i = dnnl_OhwI16o2i;
@@ -1272,6 +1318,8 @@ const format_tag_t OhwI8i8o = dnnl_OhwI8i8o;
 const format_tag_t OIhw8o16i2o = dnnl_OIhw8o16i2o;
 const format_tag_t IOhw8o16i2o = dnnl_IOhw8o16i2o;
 const format_tag_t OIhw8o8i = dnnl_OIhw8o8i;
+const format_tag_t OIhw8o32i = dnnl_OIhw8o32i;
+const format_tag_t OIhw16o32i = dnnl_OIhw16o32i;
 const format_tag_t OIhw8o4i = dnnl_OIhw8o4i;
 const format_tag_t Owhi16o = dnnl_Owhi16o;
 const format_tag_t Odwhi16o = dnnl_Odwhi16o;
@@ -1327,6 +1375,7 @@ const format_tag_t OIdhw8i8o = dnnl_OIdhw8i8o;
 const format_tag_t OdhwI8i8o = dnnl_OdhwI8i8o;
 const format_tag_t OIdhw8o8i = dnnl_OIdhw8o8i;
 const format_tag_t OIdhw8o4i = dnnl_OIdhw8o4i;
+const format_tag_t gIOw8o8i = dnnl_gIOw8o8i;
 const format_tag_t gIOw16o16i = dnnl_gIOw16o16i;
 const format_tag_t Goiw16g = dnnl_Goiw16g;
 const format_tag_t Goiw8g = dnnl_Goiw8g;
@@ -1355,7 +1404,9 @@ const format_tag_t gOwi4o = dnnl_gOwi4o;
 const format_tag_t gOwi8o = dnnl_gOwi8o;
 const format_tag_t gOwI8o2i = dnnl_gOwI8o2i;
 const format_tag_t gOwI8o4i = dnnl_gOwI8o4i;
+const format_tag_t gIOdhw8o8i = dnnl_gIOdhw8o8i;
 const format_tag_t gIOdhw16o16i = dnnl_gIOdhw16o16i;
+const format_tag_t gIOhw8o8i = dnnl_gIOhw8o8i;
 const format_tag_t gIOhw16o16i = dnnl_gIOhw16o16i;
 const format_tag_t gOhwi16o = dnnl_gOhwi16o;
 const format_tag_t gOhwI16o2i = dnnl_gOhwI16o2i;
@@ -1454,10 +1505,12 @@ const format_tag_t gOIhw4o8i2o = dnnl_gOIhw4o8i2o;
 const format_tag_t gOIdhw4o8i2o = dnnl_gOIdhw4o8i2o;
 const format_tag_t ldOi16o = dnnl_ldOi16o;
 const format_tag_t ldOi32o = dnnl_ldOi32o;
+const format_tag_t ldOI16o4i = dnnl_ldOI16o4i;
 const format_tag_t ldOI32o4i = dnnl_ldOI32o4i;
 const format_tag_t ldIo32i = dnnl_ldIo32i;
 const format_tag_t ldgOi16o = dnnl_ldgOi16o;
 const format_tag_t ldgOi32o = dnnl_ldgOi32o;
+const format_tag_t ldgOI16o4i = dnnl_ldgOI16o4i;
 const format_tag_t ldgOI32o2i = dnnl_ldgOI32o2i;
 const format_tag_t ldgOI32o4i = dnnl_ldgOI32o4i;
 const format_tag_t ldgOI64o2i = dnnl_ldgOI64o2i;
@@ -1894,6 +1947,15 @@ const rnn_flags_t diff_weights_overwrite
         = dnnl_rnn_flags_diff_weights_overwrite;
 } // namespace rnn_flags

+using sparse_encoding_t = dnnl_sparse_encoding_t;
+namespace sparse_encoding {
+const sparse_encoding_t undef = dnnl_sparse_encoding_undef;
+const sparse_encoding_t any = dnnl_sparse_encoding_any;
+const sparse_encoding_t packed = dnnl_sparse_encoding_packed;
+const sparse_encoding_t csr = dnnl_sparse_encoding_csr;
+const sparse_encoding_t coo = dnnl_sparse_encoding_coo;
+} // namespace sparse_encoding
+
 using engine_kind_t = dnnl_engine_kind_t;
 namespace engine_kind {
 const engine_kind_t any_engine = dnnl_any_engine;
@@ -1906,6 +1968,7 @@ enum runtime_kind_t {
     dnnl_runtime_seq,
     dnnl_runtime_omp,
     dnnl_runtime_tbb,
+    dnnl_runtime_tbb_auto,
     dnnl_runtime_threadpool,
     dnnl_runtime_ocl,
     dnnl_runtime_sycl,
@@ -1916,6 +1979,7 @@ const runtime_kind_t none = dnnl_runtime_none;
 const runtime_kind_t seq = dnnl_runtime_seq;
 const runtime_kind_t omp = dnnl_runtime_omp;
 const runtime_kind_t tbb = dnnl_runtime_tbb;
+const runtime_kind_t tbb_auto = dnnl_runtime_tbb_auto;
 const runtime_kind_t threadpool = dnnl_runtime_threadpool;
 const runtime_kind_t ocl = dnnl_runtime_ocl;
 const runtime_kind_t sycl = dnnl_runtime_sycl;
@@ -1945,6 +2009,9 @@ const primitive_kind_t reduction = dnnl_reduction;
 const primitive_kind_t softmax = dnnl_softmax;
 const primitive_kind_t layer_normalization = dnnl_layer_normalization;
 const primitive_kind_t group_normalization = dnnl_group_normalization;
+const primitive_kind_t depthwise = dnnl_depthwise;
+const primitive_kind_t quantization = dnnl_quantization;
+const primitive_kind_t binarization = dnnl_binarization;

 // Internal only primitive kinds.
 const primitive_kind_t internal_only_start = (primitive_kind_t)(1 << 12);
@@ -2025,17 +2092,26 @@ const query_t sparse_encoding = dnnl_query_sparse_encoding;
 const query_t nnz_s64 = dnnl_query_nnz_s64;
 const query_t num_handles_s32 = dnnl_query_num_handles_s32;
 #else
-const query_t sparse_encoding = static_cast<query_t>(266);
-const query_t nnz_s64 = static_cast<query_t>(267);
-const query_t num_handles_s32 = static_cast<query_t>(268);
+// const query_t sparse_encoding = static_cast<query_t>(266);
+// const query_t nnz_s64 = static_cast<query_t>(267);
+// const query_t num_handles_s32 = static_cast<query_t>(268);
 #endif

 // Internal only query kinds.
 const query_t internal_only_start = (query_t)(1 << 12);
 const query_t zero_pad_d = internal_only_start;
 const query_t preferred_gpu_threads_per_eu = (query_t)(internal_only_start + 1);
+const query_t sparse_encoding = dnnl_query_sparse_encoding;
 } // namespace query

+// There are no external values to map to because this is an internal feature
+// for now.
+using matmul_reduce_kind_t = int;
+namespace matmul_reduce_kind {
+const matmul_reduce_kind_t undef = 0;
+const matmul_reduce_kind_t src = 1;
+} // namespace matmul_reduce_kind
+
 using rnn_direction_t = dnnl_rnn_direction_t;

 using engine_t = dnnl_engine;
diff --git a/src/common/cache_blob_id.cpp b/src/common/cache_blob_id.cpp
index aedcc393bfd..b6b9de9c553 100644
--- a/src/common/cache_blob_id.cpp
+++ b/src/common/cache_blob_id.cpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2021-2023 Intel Corporation
+* Copyright 2021-2025 Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -19,8 +19,8 @@
 #include "common/dnnl_thread.hpp"
 #include "common/engine.hpp"
 #include "common/primitive_desc.hpp"
+#include "common/primitive_serialization.hpp"
 #include "common/serialization.hpp"
-#include "common/serialization_stream.hpp"

 namespace dnnl {
 namespace impl {
@@ -38,45 +38,43 @@ const std::vector<uint8_t> &cache_blob_id_t::get(
         return sstream_.get_data();
     }

-    if (pd->op_desc()->kind == primitive_kind::zero_pad) {
-        return sstream_.get_data();
-    }
+    if (pd->kind() == primitive_kind::zero_pad) { return sstream_.get_data(); }

     assert(engine->kind() == engine_kind::gpu
             && engine->runtime_kind() == runtime_kind::ocl);
     const auto init_id = [&]() {
-        serialization::serialize_desc(sstream_, pd->op_desc());
-        serialization::serialize_attr(sstream_, *pd->attr());
+        serialize_desc(sstream_, pd->op_desc());
+        serialize(sstream_, *pd->attr());

         const int nthr = engine->kind() == engine_kind::gpu
                 ? 0
                 : dnnl_get_max_threads();
-        sstream_.write(&nthr);
+        sstream_.append(nthr);

         for (const auto &md : pd->hint_mds(false /* is_hint */)) {
-            serialization::serialize_md(sstream_, md);
+            serialize(sstream_, md);
         }

-        sstream_.write(&engine_kind);
+        sstream_.append(engine_kind);
         // TODO: blob object can probably be re-used for different runtimes
         // if the engine kind is the same. Check this assumption when extending
         // this API to DPCPP runtime.
-        sstream_.write(&runtime_kind);
+        sstream_.append(runtime_kind);

         engine->serialize_device(sstream_);

         auto pd_iterator_offset = pd->pd_iterator_offset();
-        sstream_.write(&pd_iterator_offset);
+        sstream_.append(pd_iterator_offset);

         auto pd_skip_idx = pd->skip_idx();
-        sstream_.write(&pd_skip_idx);
+        sstream_.append(pd_skip_idx);

         auto version = dnnl_version();
-        sstream_.write(&version->major);
-        sstream_.write(&version->minor);
-        sstream_.write(&version->patch);
+        sstream_.append(version->major);
+        sstream_.append(version->minor);
+        sstream_.append(version->patch);

-        sstream_.write(version->hash, std::strlen(version->hash));
+        sstream_.append_array(std::strlen(version->hash), version->hash);

         is_initialized_ = true;
     };
diff --git a/src/common/cache_blob_id.hpp b/src/common/cache_blob_id.hpp
index 53c0f002709..46eadf217da 100644
--- a/src/common/cache_blob_id.hpp
+++ b/src/common/cache_blob_id.hpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2021 Intel Corporation
+* Copyright 2021-2025 Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -23,7 +23,7 @@
 #include
 #include

-#include "common/serialization_stream.hpp"
+#include "common/serialization.hpp"

 namespace dnnl {
 namespace impl {
diff --git a/src/common/compiler_workarounds.hpp b/src/common/compiler_workarounds.hpp
index 17beeb84b72..bedbd8f82d4 100644
--- a/src/common/compiler_workarounds.hpp
+++ b/src/common/compiler_workarounds.hpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2020-2024 Intel Corporation
+* Copyright 2020-2025 Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -17,11 +17,6 @@
 #ifndef COMPILER_WORKAROUNDS_HPP
 #define COMPILER_WORKAROUNDS_HPP

-#if (defined __GNUC__) && (!defined(__INTEL_COMPILER)) \
-        && (!defined(__INTEL_LLVM_COMPILER)) && (!defined(__clang__major__))
-#define NEED_GCC_WA_CHECK 1
-#endif
-
 // Workaround 01: clang.
 //
 // Clang has an issue [1] with `#pragma omp simd` that might lead to segfault.
@@ -32,7 +27,7 @@
 // vectorization for clang altogether for now.
 //
 // [1] https://bugs.llvm.org/show_bug.cgi?id=48104
-#if (defined __clang_major__) && (__clang_major__ >= 6)
+#if (defined __clang_major__) && (__clang_major__ < 13)
 #define CLANG_WA_01_SAFE_TO_USE_OMP_SIMD 0
 #else
 #define CLANG_WA_01_SAFE_TO_USE_OMP_SIMD 1
@@ -40,48 +35,15 @@

 // Workaround 02: clang.
 //
-// Clang 6+ generates incorrect code with OMP_SIMD in some particular cases.
+// Clang generates incorrect code with OMP_SIMD in some particular cases.
 // Unlike CLANG_WA_01_SAFE_TO_USE_OMP_SIMD, the issue happens even with -O3.
-#if (defined __clang_major__) && (__clang_major__ >= 6)
+#if (defined __clang_major__) && (__clang_major__ < 13)
 #define CLANG_WA_02_SAFE_TO_USE_OMP_SIMD 0
 #else
 #define CLANG_WA_02_SAFE_TO_USE_OMP_SIMD 1
 #endif

-// Workaround 03: GCC
-//
-// For very large functions with too much control flow (i.e. if, switch, goto
-// statements), GCC 7 may struggle to perform optimizations based on tree
-// dominator (i.e. -ftree-dominator-opts, which is enabled with O1), thereby
-// producing an internal compiler error (ICE). Specifically, it seems that the
-// jump threading optimization is the culprit, which cannot be disabled on its
-// own. There is no reliable way to reproduce the ICE, therefore it is not clear
-// which __GCC_MINOR__ version fixes issue.
-#if (defined NEED_GCC_WA_CHECK) && (__GNUC__ == 7)
-#define GCC_WA_NO_TREE_DOMINATOR_OPTS 1
-#else
-#define GCC_WA_NO_TREE_DOMINATOR_OPTS 0
-#endif
-
-// Workaround 04: GCC
-//
-// GCC 10 & 11 && 12 (at least versiona 10.1, 10.3 & 11.1, 12.2) report false positives
-// in xbyak when -Warray-bounds build setting is on
-#if (defined NEED_GCC_WA_CHECK) && (__GNUC__ >= 10)
-#pragma GCC diagnostic ignored "-Warray-bounds"
-#endif
-
-// Workaround 05: GCC
-//
-// NOTE: inside lambda, type cast variables captured by reference using
-// either c-like "(type)var" or functional "type(var)" notation in order
-// to avoid gcc7 bug with c++14 standard
-// (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=83204).
-#if (defined NEED_GCC_WA_CHECK) && (__GNUC__ <= 7)
-#define GCC_WA_LAMBDA_C_CAST
-#endif
-
-// Workaround 05: c++17 vs c++20
+// Workaround 03: MSVC c++17 vs c++20
 //
 // C++17/20 are contradictory wrt capturing this and using default '=' capture.
 // - C++17 and before have to return a warning for the [=, this] capture as
diff --git a/src/common/concat.cpp b/src/common/concat.cpp
index d686df416f8..df4c65bc00b 100644
--- a/src/common/concat.cpp
+++ b/src/common/concat.cpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2018-2023 Intel Corporation
+* Copyright 2018-2025 Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -51,13 +51,25 @@ status_t concat_primitive_desc_create(std::shared_ptr<primitive_desc_t> &pd,
         attr = &default_attr();
     else {
         using smask_t = primitive_attr_t::skip_mask_t;
-        VCHECK_CONCAT_UNIMPL(attr->has_default_values(smask_t::scales_runtime),
+        VCHECK_CONCAT_UNIMPL(attr->has_default_values(smask_t::scales),
                 VERBOSE_UNSUPPORTED_ATTR);
         const auto &scales = attr->scales_;
-        if (!scales.has_default_values())
-            for (const auto &s : scales.scales_)
-                VCHECK_CONCAT_UNIMPL(
-                        s.second.mask_ == 0, VERBOSE_UNSUPPORTED_SCALES_CFG);
+        if (!scales.has_default_values()) {
+            std::vector<int> supported_args(n);
+            for (int i = 0; i < n; i++) {
+                supported_args[i] = DNNL_ARG_MULTIPLE_SRC + i;
+            }
+            VCHECK_CONCAT_UNIMPL(
+                    attr->scales_.has_default_values(supported_args),
+                    VERBOSE_UNSUPPORTED_SCALES_CFG);
+
+            for (int arg : supported_args) {
+                if (scales.has_default_values(arg)) continue;
+
+                int mask = scales.get_mask(arg);
+                VCHECK_CONCAT_UNIMPL(mask == 0, VERBOSE_UNSUPPORTED_SCALES_CFG);
+            }
+        }
     }

     const int ndims = src_mds[0]->ndims;
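Concat accepts one scale per source tensor, so the supported argument list is built dynamically as `DNNL_ARG_MULTIPLE_SRC + i`, and any non-default entry must use mask 0 (a single scalar scale per tensor). A hedged Python sketch of that validation rule; the constant value and helper names are illustrative:

```python
DNNL_ARG_MULTIPLE_SRC = 1024  # illustrative stand-in for the C constant

def concat_scales_ok(scale_masks, n_srcs):
    """scale_masks: {arg: mask} for the scales the user actually set."""
    supported = {DNNL_ARG_MULTIPLE_SRC + i for i in range(n_srcs)}
    if not set(scale_masks) <= supported:
        return False  # scales attached to an unsupported argument
    # each non-default scale must be a single scalar: mask == 0
    return all(mask == 0 for mask in scale_masks.values())

assert concat_scales_ok({DNNL_ARG_MULTIPLE_SRC + 1: 0}, n_srcs=3)
assert not concat_scales_ok({DNNL_ARG_MULTIPLE_SRC: 2}, n_srcs=3)
```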
@@ -37,13 +37,14 @@ namespace dnnl { namespace impl { +// NOLINTBEGIN(google-default-arguments) struct concat_pd_t : public primitive_desc_t { const concat_desc_t *desc() const { return &desc_; } const op_desc_t *op_desc() const override { return reinterpret_cast(this->desc()); } - ~concat_pd_t() = default; + ~concat_pd_t() override = default; arg_usage_t arg_usage(int arg) const override { if (arg >= DNNL_ARG_MULTIPLE_SRC @@ -95,7 +96,6 @@ struct concat_pd_t : public primitive_desc_t { * use this auxiliary array iff init() returned success */ std::vector src_image_mds_; -protected: concat_desc_t desc_; concat_pd_t(const primitive_attr_t *attr, const memory_desc_t *dst_md, @@ -112,14 +112,14 @@ struct concat_pd_t : public primitive_desc_t { init_desc(); } - concat_pd_t(const concat_pd_t &other) : primitive_desc_t(other) { - n_ = other.n_; - concat_dim_ = other.concat_dim_; - dst_md_ = other.dst_md_; - original_dst_ = other.original_dst_; - src_mds_ = other.src_mds_; - src_image_mds_ = other.src_image_mds_; - + concat_pd_t(const concat_pd_t &other) + : primitive_desc_t(other) + , n_(other.n_) + , concat_dim_(other.concat_dim_) + , dst_md_(other.dst_md_) + , original_dst_(other.original_dst_) + , src_mds_(other.src_mds_) + , src_image_mds_(other.src_image_mds_) { init_desc(); } @@ -266,6 +266,7 @@ struct concat_pd_t : public primitive_desc_t { desc_.src_mds.push_back(&md); } }; +// NOLINTEND(google-default-arguments) #define DECLARE_CONCAT_PD_t(impl_name, ...) \ static status_t create(concat_pd_t **concat_pd, \ @@ -284,6 +285,7 @@ struct concat_pd_t : public primitive_desc_t { &primitive, \ dnnl::impl::engine_t *engine, const cache_blob_t &cache_blob) \ const override { \ + DNNL_PRIMITIVE_CREATE(pd_t) \ return primitive_t::create_primitive_common<__VA_ARGS__, pd_t>( \ primitive, this, engine, false, cache_blob); \ } \ diff --git a/src/common/convolution.cpp b/src/common/convolution.cpp index 9300043adcc..93c55bc64b2 100644 --- a/src/common/convolution.cpp +++ b/src/common/convolution.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2016-2024 Intel Corporation +* Copyright 2016-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
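DNNL_PRIMITIVE_CREATE(pd_t), spliced into the DECLARE_CONCAT_PD_t creation path above, is defined by the dnnl_sel_build.hpp header added later in this patch; in analyzer builds it opens an ITT scoped task named after the pd type. A rough, hypothetical stand-in for what that expansion does (the real macro uses OV_ITT_SCOPED_TASK; trace_task_t here is invented for illustration):

    #include <cstdio>
    #include <string>
    #include <typeinfo>

    // Hypothetical RAII scope: reports when a given pd type enters and
    // leaves its create path, so an analyzer build can record which
    // primitive implementations are actually instantiated.
    struct trace_task_t {
        std::string name;
        explicit trace_task_t(std::string n) : name(std::move(n)) {
            std::printf("begin %s\n", name.c_str());
        }
        ~trace_task_t() { std::printf("end %s\n", name.c_str()); }
    };

    #define TRACE_PRIMITIVE_CREATE(pd_t) \
        trace_task_t trace_task_ { \
                std::string("CREATE$CPUEngine$") + typeid(pd_t).name()};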
@@ -116,6 +116,18 @@ status_t conv_desc_init(convolution_desc_t *conv_desc, prop_kind_t prop_kind, VERBOSE_INCONSISTENT_DIM, "src", 1, "weights", with_groups + 1); VCHECK_CONV(dst_desc->dims[1] == g * weights_desc->dims[with_groups + 0], VERBOSE_INCONSISTENT_DIM, "dst", 1, "weights", with_groups + 0); + // s4/u4/f4 weights requires channels to be multiple of 2 to be byte aligned + VCHECK_CONV(IMPLICATION(utils::one_of(weights_desc->data_type, + data_type::s4, data_type::u4, + data_type::f4_e2m1, data_type::f4_e3m0), + weights_desc->dims[with_groups + 1] % 2 == 0), + VERBOSE_INCONSISTENT_DIM, "weights", with_groups + 1); + // s4/u4/f4 src requires channels to be multiple of 2 to be byte aligned + VCHECK_CONV(IMPLICATION(utils::one_of(src_desc->data_type, data_type::s4, + data_type::u4, data_type::f4_e2m1, + data_type::f4_e3m0), + src_desc->dims[1] % 2 == 0), + VERBOSE_INCONSISTENT_DIM, "src", 1); int sp_dims = src_desc->ndims - 2; utils::array_copy(cd.strides, strides, sp_dims); @@ -136,7 +148,8 @@ status_t conv_desc_init(convolution_desc_t *conv_desc, prop_kind_t prop_kind, dim_t dst = dst_desc->dims[i]; dim_t ker_range = 1 + (ker - 1) * (dil + 1); VCHECK_CONV(str > 0, VERBOSE_BAD_DIM, "strides", i - 2); - VCHECK_CONV(dil >= 0 && pad_l >= 0 && pad_r + str > 0, + //VCHECK_CONV(dil >= 0 && pad_l >= 0 && pad_r + str > 0, // TODO: [dmitrygo] Commented as WA to support dw conv fusing + VCHECK_CONV(dil >= 0 && pad_l >= 0, VERBOSE_INCONSISTENT_PRB); VCHECK_CONV((src - ker_range + pad_l + pad_r) / str + 1 == dst, VERBOSE_INCONSISTENT_PRB); @@ -159,18 +172,26 @@ status_t conv_attr_check(const convolution_desc_t &desc, const engine_t *engine, const data_type_t src_dt = desc.src_desc.data_type; const data_type_t dst_dt = desc.dst_desc.data_type; - auto fwd_attr_mask - = smask_t::post_ops | smask_t::sum_dt | smask_t::fpmath_mode; - - bool is_int8 = utils::one_of(src_dt, data_type::s8, data_type::u8); - if (engine->kind() == engine_kind::gpu) - is_int8 = is_int8 - || utils::one_of(dst_dt, data_type::s8, data_type::u8, - data_type::s32); - if (is_int8) - fwd_attr_mask |= smask_t::scales_runtime - | smask_t::zero_points_runtime - | smask_t::zero_points_runtime_data_type; + auto fwd_attr_mask = smask_t::post_ops | smask_t::sum_dt + | smask_t::fpmath_mode | smask_t::rounding_mode; + const bool is_gpu = engine->kind() == engine_kind::gpu; + + const bool is_int8 = utils::one_of(src_dt, data_type::s8, data_type::u8) + || (is_gpu + && utils::one_of(dst_dt, data_type::s8, data_type::u8, + data_type::s32)); + const bool is_fp8 = is_gpu + && (utils::one_of( + src_dt, data_type::f8_e5m2, data_type::f8_e4m3) + || utils::one_of(dst_dt, data_type::f8_e5m2, + data_type::f8_e4m3)); + const bool enable_quantization = is_int8 || is_fp8; + if (enable_quantization) + fwd_attr_mask |= smask_t::zero_points_data_type + | smask_t::scales_data_type + | smask_t::input_zero_points + | smask_t::output_compensations + | smask_t::weights_zero_points; VCHECK_CONV_UNIMPL(attr->has_default_values(fwd_attr_mask, dst_dt), VERBOSE_UNSUPPORTED_ATTR); @@ -178,27 +199,37 @@ status_t conv_attr_check(const convolution_desc_t &desc, const engine_t *engine, // Check scales if (!attr->scales_.has_default_values()) { const auto &sc = attr->scales_; - const int mask_src = sc.get(DNNL_ARG_SRC).mask_; - const int mask_wei = sc.get(DNNL_ARG_WEIGHTS).mask_; - const int mask_dst = sc.get(DNNL_ARG_DST).mask_; const bool with_groups = desc.src_desc.ndims != desc.weights_desc.ndims; - VCHECK_CONV_UNIMPL(utils::everyone_is(0, mask_src, mask_dst) - && 
utils::one_of(mask_wei, 0, with_groups ? 3 : 1), + VCHECK_CONV_UNIMPL(IMPLICATION(!sc.has_default_values(DNNL_ARG_SRC), + sc.get_mask(DNNL_ARG_SRC) == 0), + VERBOSE_UNSUPPORTED_SCALES_CFG); + VCHECK_CONV_UNIMPL( + IMPLICATION(!sc.has_default_values(DNNL_ARG_WEIGHTS), + utils::one_of(sc.get_mask(DNNL_ARG_WEIGHTS), 0, + with_groups ? 3 : 1)), + VERBOSE_UNSUPPORTED_SCALES_CFG); + VCHECK_CONV_UNIMPL( + IMPLICATION(!sc.has_default_values(DNNL_ARG_DST), + utils::one_of(sc.get_mask(DNNL_ARG_DST), 0, 2)), VERBOSE_UNSUPPORTED_SCALES_CFG); } // Check zero points if (!attr->zero_points_.has_default_values()) { const auto &zp = attr->zero_points_; - int mask_src = 0, mask_wei = 0, mask_dst = 0; - zp.get(DNNL_ARG_SRC, &mask_src); - zp.get(DNNL_ARG_WEIGHTS, &mask_wei); - zp.get(DNNL_ARG_DST, &mask_dst); - - VCHECK_CONV_UNIMPL((mask_src == 0 || mask_src == 1 << 1) - && (mask_wei == 0) - && (mask_dst == 0 || mask_dst == 1 << 1), + + VCHECK_CONV_UNIMPL(IMPLICATION(!zp.has_default_values(DNNL_ARG_SRC), + utils::one_of(zp.get_mask(DNNL_ARG_SRC), + 0, 1 << 1)), + VERBOSE_UNSUPPORTED_ZP_CFG); + VCHECK_CONV_UNIMPL( + IMPLICATION(!zp.has_default_values(DNNL_ARG_WEIGHTS), + zp.get_mask(DNNL_ARG_WEIGHTS) == 0), + VERBOSE_UNSUPPORTED_ZP_CFG); + VCHECK_CONV_UNIMPL(IMPLICATION(!zp.has_default_values(DNNL_ARG_DST), + utils::one_of(zp.get_mask(DNNL_ARG_DST), + 0, 1 << 1)), VERBOSE_UNSUPPORTED_ZP_CFG); } @@ -207,17 +238,20 @@ status_t conv_attr_check(const convolution_desc_t &desc, const engine_t *engine, const auto &po = attr->post_ops_; using namespace primitive_kind; VCHECK_CONV_UNIMPL(po.has_default_values({binary, eltwise, prelu, - sum, convolution}), + sum, convolution, depthwise, quantization}), VERBOSE_UNSUPPORTED_POSTOP); // Check sum VCHECK_CONV_UNIMPL(po.check_sum_consistency(dst_dt, is_int8, true), VERBOSE_UNSUPPORTED_POSTOP); + + // Note: verbose support is inside the call. + CHECK(po.validate_binary_with_dst_consistency(&desc.dst_desc)); } - } else { - auto bwd_attr_mask = smask_t::fpmath_mode; - VCHECK_CONV_UNIMPL(attr->has_default_values(bwd_attr_mask), - VERBOSE_UNSUPPORTED_ATTR); + // } else { + // auto bwd_attr_mask = smask_t::fpmath_mode | smask_t::accumulation_mode; + // VCHECK_CONV_UNIMPL(attr->has_default_values(bwd_attr_mask), + // VERBOSE_UNSUPPORTED_ATTR); } return status::success; diff --git a/src/common/convolution_pd.hpp b/src/common/convolution_pd.hpp index 123f15d6e95..f85d5cd8ae6 100644 --- a/src/common/convolution_pd.hpp +++ b/src/common/convolution_pd.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2016-2024 Intel Corporation +* Copyright 2016-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
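Restated, the convolution attribute checks above admit: src scales with mask 0; weights scales with mask 0 or per output channel (1, or 3 with groups); dst scales with mask 0 or 2; src/dst zero points with mask 0 or 1 << 1. A sketch of one accepted int8 configuration through the public attr API (illustrative only):

    #include "oneapi/dnnl/dnnl.hpp"

    inline dnnl::primitive_attr make_int8_conv_attr() {
        dnnl::primitive_attr attr;
        // One scale for the whole src tensor (mask = 0).
        attr.set_scales_mask(DNNL_ARG_SRC, 0);
        // Per-output-channel weights scales (mask = 1, no groups).
        attr.set_scales_mask(DNNL_ARG_WEIGHTS, 1);
        // A single src zero point (mask = 0).
        attr.set_zero_points_mask(DNNL_ARG_SRC, 0);
        return attr;
    }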
@@ -182,17 +182,16 @@ struct convolution_pd_t : public primitive_desc_t { convolution_desc_t desc_; const convolution_fwd_pd_t *hint_fwd_pd_; - convolution_pd_t(const convolution_desc_t *adesc, - const primitive_attr_t *attr, + convolution_pd_t(const op_desc_t *adesc, const primitive_attr_t *attr, const convolution_fwd_pd_t *hint_fwd_pd) : primitive_desc_t(attr, base_pkind) - , desc_(*adesc) + , desc_(*op_desc_t::to_desc(adesc)) , hint_fwd_pd_(hint_fwd_pd) {} bool set_default_formats_common_template(memory_desc_t &src_md, format_tag_t src_tag, memory_desc_t &wei_md, format_tag_t wei_tag, memory_desc_t &dst_md, format_tag_t dst_tag, - memory_desc_t &bia_md) { + memory_desc_t &bia_md) const { using namespace format_tag; #define IS_OK(f) \ @@ -243,9 +242,13 @@ struct convolution_pd_t : public primitive_desc_t { = {DNNL_ARG_SRC, DNNL_ARG_WEIGHTS, DNNL_ARG_DST}) const { bool ok = attr()->scales_.has_default_values(supported_args); for (int arg : supported_args) { - const auto &mask = attr()->scales_.get(arg).mask_; + if (attr()->scales_.has_default_values(arg)) continue; + + const auto &mask = attr()->scales_.get_mask(arg); if (arg == DNNL_ARG_WEIGHTS) ok = ok && (mask == 0 || mask == (with_groups() ? 3 : 1)); + else if (arg == DNNL_ARG_DST) + ok = ok && (mask == 0 || mask == 2); else ok = ok && (mask == 0); } @@ -253,15 +256,17 @@ struct convolution_pd_t : public primitive_desc_t { } }; +// NOLINTBEGIN(google-default-arguments) struct convolution_fwd_pd_t : public convolution_pd_t { - typedef convolution_fwd_pd_t base_class; - typedef convolution_fwd_pd_t hint_class; + using base_class = convolution_fwd_pd_t; + using hint_class = convolution_fwd_pd_t; arg_usage_t arg_usage(int arg) const override { if (utils::one_of(arg, DNNL_ARG_SRC, DNNL_ARG_WEIGHTS)) return arg_usage_t::input; - if (arg == DNNL_ARG_BIAS && with_bias()) return arg_usage_t::input; + if (arg == DNNL_ARG_BIAS) + return with_bias() ? arg_usage_t::input : arg_usage_t::unused; if (arg == DNNL_ARG_DST) return arg_usage_t::output; @@ -299,7 +304,7 @@ struct convolution_fwd_pd_t : public convolution_pd_t { int n_inputs() const override { return 2 + with_bias() + attr_post_op_dw_inputs() + n_binary_po_inputs() - + n_prelu_po_inputs(); + + n_prelu_po_inputs() + n_depthwise_po_inputs() + n_quantization_po_inputs(); } int n_outputs() const override { return 1; } @@ -310,8 +315,7 @@ struct convolution_fwd_pd_t : public convolution_pd_t { memory_desc_t bias_md_; memory_desc_t dst_md_; - convolution_fwd_pd_t(const convolution_desc_t *adesc, - const primitive_attr_t *attr, + convolution_fwd_pd_t(const op_desc_t *adesc, const primitive_attr_t *attr, const convolution_fwd_pd_t *hint_fwd_pd) : convolution_pd_t(adesc, attr, hint_fwd_pd) , src_md_(desc_.src_desc) @@ -329,14 +333,15 @@ struct convolution_fwd_pd_t : public convolution_pd_t { const auto &po = attr_.post_ops_; int conv = po.find(primitive_kind::convolution); if (conv == -1) return 0; - return po.entry_[conv].depthwise_conv.bias_dt == data_type::undef ? 
1
- : 2;
+ return 2;
 }
 };
+// NOLINTEND(google-default-arguments)
+
+// NOLINTBEGIN(google-default-arguments)
 struct convolution_bwd_data_pd_t : public convolution_pd_t {
- typedef convolution_bwd_data_pd_t base_class;
- typedef convolution_fwd_pd_t hint_class;
+ using base_class = convolution_bwd_data_pd_t;
+ using hint_class = convolution_fwd_pd_t;

 arg_usage_t arg_usage(int arg) const override {
 if (utils::one_of(arg, DNNL_ARG_WEIGHTS, DNNL_ARG_DIFF_DST))
@@ -378,7 +383,9 @@ struct convolution_bwd_data_pd_t : public convolution_pd_t {
 return &glob_zero_md;
 }

- int n_inputs() const override { return 2 + with_bias(); }
+ int n_inputs() const override {
+ return 2 + with_bias() + n_depthwise_po_inputs() + n_quantization_po_inputs();
+ }
 int n_outputs() const override { return 1; }

 virtual bool support_bias() const { return false; }
@@ -389,7 +396,7 @@ struct convolution_bwd_data_pd_t : public convolution_pd_t {
 memory_desc_t bias_md_;
 memory_desc_t diff_dst_md_;

- convolution_bwd_data_pd_t(const convolution_desc_t *adesc,
+ convolution_bwd_data_pd_t(const op_desc_t *adesc,
 const primitive_attr_t *attr, const convolution_fwd_pd_t *hint_fwd_pd)
 : convolution_pd_t(adesc, attr, hint_fwd_pd)
@@ -404,12 +411,14 @@ struct convolution_bwd_data_pd_t : public convolution_pd_t {
 weights_md_, wei_tag, diff_dst_md_, diff_dst_tag, bias_md_);
 }
 };
+// NOLINTEND(google-default-arguments)
+
+// NOLINTBEGIN(google-default-arguments)
 struct convolution_bwd_weights_pd_t : public convolution_pd_t {
- typedef convolution_bwd_weights_pd_t base_class;
- typedef convolution_fwd_pd_t hint_class;
+ using base_class = convolution_bwd_weights_pd_t;
+ using hint_class = convolution_fwd_pd_t;

- convolution_bwd_weights_pd_t(const convolution_desc_t *adesc,
+ convolution_bwd_weights_pd_t(const op_desc_t *adesc,
 const primitive_attr_t *attr, const convolution_fwd_pd_t *hint_fwd_pd)
 : convolution_pd_t(adesc, attr, hint_fwd_pd)
@@ -424,8 +433,8 @@ struct convolution_bwd_weights_pd_t : public convolution_pd_t {
 if (arg == DNNL_ARG_DIFF_WEIGHTS) return arg_usage_t::output;

- if (arg == DNNL_ARG_DIFF_BIAS && with_bias())
- return arg_usage_t::output;
+ if (arg == DNNL_ARG_DIFF_BIAS)
+ return with_bias() ? arg_usage_t::output : arg_usage_t::unused;

 return primitive_desc_t::arg_usage(arg);
 }
@@ -477,6 +486,7 @@ struct convolution_bwd_weights_pd_t : public convolution_pd_t {
 diff_bias_md_);
 }
 };
+// NOLINTEND(google-default-arguments)

 } // namespace impl
 } // namespace dnnl
diff --git a/src/common/deconvolution.cpp b/src/common/deconvolution.cpp
index 00f3f89d037..54352dbcdd3 100644
--- a/src/common/deconvolution.cpp
+++ b/src/common/deconvolution.cpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2018-2024 Intel Corporation
+* Copyright 2018-2025 Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -131,8 +131,12 @@ status_t deconv_desc_init(deconvolution_desc_t *deconv_desc,
 dim_t ker_range = 1 + (ker - 1) * (dil + 1);
 VCHECK_DECONV(str > 0, VERBOSE_BAD_DIM, "strides", i - 2);
- VCHECK_DECONV(dil >= 0 && pad_l >= 0 && pad_r + str > 0,
- VERBOSE_INCONSISTENT_PRB);
+ // VCHECK_DECONV(dil >= 0 && pad_l >= 0 && pad_r + str > 0,
+ // VERBOSE_INCONSISTENT_PRB);
+ // WA: OV has a feature to set the output shape explicitly, which can make the specified output spatial dims larger than the deconv's actual spatial dims.
+ // Extra padding on the spatial dims is then needed; pad_r < 0 && pad_r + str <= 0 in these test cases.
+ VCHECK_DECONV(dil >= 0 && pad_l >= 0,
+ VERBOSE_INCONSISTENT_PRB);
 VCHECK_DECONV((dst - ker_range + pad_l + pad_r) / str + 1 == src,
 VERBOSE_INCONSISTENT_PRB);
 }
@@ -162,9 +166,7 @@ status_t deconv_attr_check(const deconvolution_desc_t &desc,
 is_int8 = is_int8
 || utils::one_of(dst_dt, data_type::s8, data_type::u8,
 data_type::s32);
- if (is_int8)
- fwd_attr_mask
- |= smask_t::scales_runtime | smask_t::zero_points_runtime;
+ if (is_int8) fwd_attr_mask |= smask_t::scales | smask_t::zero_points;

 VCHECK_DECONV_UNIMPL(attr->has_default_values(fwd_attr_mask, dst_dt),
 VERBOSE_UNSUPPORTED_ATTR);
@@ -172,26 +174,38 @@ status_t deconv_attr_check(const deconvolution_desc_t &desc,
 // Check scales
 if (!attr->scales_.has_default_values()) {
 const auto &sc = attr->scales_;
- const int mask_src = sc.get(DNNL_ARG_SRC).mask_;
- const int mask_wei = sc.get(DNNL_ARG_WEIGHTS).mask_;
- const int mask_dst = sc.get(DNNL_ARG_DST).mask_;
 const bool with_groups
 = desc.src_desc.ndims != desc.weights_desc.ndims;
- VCHECK_DECONV_UNIMPL(utils::everyone_is(0, mask_src, mask_dst)
- && utils::one_of(mask_wei, 0, with_groups ? 3 : 1),
+ VCHECK_DECONV_UNIMPL(
+ IMPLICATION(!sc.has_default_values(DNNL_ARG_SRC),
+ sc.get_mask(DNNL_ARG_SRC) == 0),
+ VERBOSE_UNSUPPORTED_SCALES_CFG);
+ VCHECK_DECONV_UNIMPL(
+ IMPLICATION(!sc.has_default_values(DNNL_ARG_WEIGHTS),
+ utils::one_of(sc.get_mask(DNNL_ARG_WEIGHTS), 0,
+ with_groups ? 3 : 1)),
+ VERBOSE_UNSUPPORTED_SCALES_CFG);
+ VCHECK_DECONV_UNIMPL(
+ IMPLICATION(!sc.has_default_values(DNNL_ARG_DST),
+ sc.get_mask(DNNL_ARG_DST) == 0),
 VERBOSE_UNSUPPORTED_SCALES_CFG);
 }

 // Check zero points
 if (!attr->zero_points_.has_default_values()) {
 const auto &zp = attr->zero_points_;
- int mask_src = 0, mask_dst = 0;
- zp.get(DNNL_ARG_SRC, &mask_src);
- zp.get(DNNL_ARG_DST, &mask_dst);
- VCHECK_DECONV_UNIMPL(zp.has_default_values(DNNL_ARG_WEIGHTS)
- && (mask_src == 0 || mask_src == 1 << 1)
- && (mask_dst == 0 || mask_dst == 1 << 1),
+ VCHECK_DECONV_UNIMPL(
+ IMPLICATION(!zp.has_default_values(DNNL_ARG_SRC),
+ utils::one_of(
+ zp.get_mask(DNNL_ARG_SRC), 0, 1 << 1)),
+ VERBOSE_UNSUPPORTED_ZP_CFG);
+ VCHECK_DECONV_UNIMPL(zp.has_default_values(DNNL_ARG_WEIGHTS),
+ VERBOSE_UNSUPPORTED_ZP_CFG);
+ VCHECK_DECONV_UNIMPL(
+ IMPLICATION(!zp.has_default_values(DNNL_ARG_DST),
+ utils::one_of(
+ zp.get_mask(DNNL_ARG_DST), 0, 1 << 1)),
 VERBOSE_UNSUPPORTED_ZP_CFG);
 }
@@ -207,6 +221,9 @@ status_t deconv_attr_check(const deconvolution_desc_t &desc,
 VCHECK_DECONV_UNIMPL(
 po.check_sum_consistency(dst_dt, is_int8, true),
 VERBOSE_UNSUPPORTED_POSTOP);
+
+ // Note: verbose support is inside the call.
+ CHECK(po.validate_binary_with_dst_consistency(&desc.dst_desc));
 }
 } else {
 auto bwd_attr_mask = smask_t::fpmath_mode;
diff --git a/src/common/deconvolution_pd.hpp b/src/common/deconvolution_pd.hpp
index bcc372384ac..62aa5c3bdf5 100644
--- a/src/common/deconvolution_pd.hpp
+++ b/src/common/deconvolution_pd.hpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2018-2024 Intel Corporation
+* Copyright 2018-2025 Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
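For the relaxed deconvolution padding check above, a worked instance of the case the workaround comment describes, with illustrative numbers (the shape identity (dst - ker_range + pad_l + pad_r) / str + 1 == src still has to hold):

    // src = 5, str = 2, ker = 3, dil = 0
    //   => ker_range = 1 + (3 - 1) * (0 + 1) = 3
    // "natural" dst = str * (src - 1) + ker_range - pad_l - pad_r
    // OV requests an enlarged dst = 14 by picking pad_l = 0, pad_r = -3:
    //   (14 - 3 + 0 - 3) / 2 + 1 = 8 / 2 + 1 = 5 == src   // identity holds
    // Note pad_r + str = -1 <= 0, which the old check rejected.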
@@ -163,18 +163,19 @@ struct deconvolution_pd_t : public primitive_desc_t { deconvolution_desc_t desc_; const deconvolution_fwd_pd_t *hint_fwd_pd_; - deconvolution_pd_t(const deconvolution_desc_t *adesc, - const primitive_attr_t *attr, + deconvolution_pd_t(const op_desc_t *adesc, const primitive_attr_t *attr, const deconvolution_fwd_pd_t *hint_fwd_pd) : primitive_desc_t(attr, base_pkind) - , desc_(*adesc) + , desc_(*op_desc_t::to_desc(adesc)) , hint_fwd_pd_(hint_fwd_pd) {} bool attr_scales_ok(const std::vector &supported_args = {DNNL_ARG_SRC, DNNL_ARG_WEIGHTS, DNNL_ARG_DST}) const { bool ok = attr()->scales_.has_default_values(supported_args); for (int arg : supported_args) { - const auto &mask = attr()->scales_.get(arg).mask_; + if (attr()->scales_.has_default_values(arg)) continue; + + const auto &mask = attr()->scales_.get_mask(arg); if (arg == DNNL_ARG_WEIGHTS) ok = ok && (mask == 0 || mask == (with_groups() ? 3 : 1)); else @@ -200,15 +201,17 @@ struct deconvolution_pd_t : public primitive_desc_t { } }; +// NOLINTBEGIN(google-default-arguments) struct deconvolution_fwd_pd_t : public deconvolution_pd_t { - typedef deconvolution_fwd_pd_t base_class; - typedef deconvolution_fwd_pd_t hint_class; + using base_class = deconvolution_fwd_pd_t; + using hint_class = deconvolution_fwd_pd_t; arg_usage_t arg_usage(int arg) const override { if (utils::one_of(arg, DNNL_ARG_SRC, DNNL_ARG_WEIGHTS)) return arg_usage_t::input; - if (arg == DNNL_ARG_BIAS && with_bias()) return arg_usage_t::input; + if (arg == DNNL_ARG_BIAS) + return with_bias() ? arg_usage_t::input : arg_usage_t::unused; if (arg == DNNL_ARG_DST) return arg_usage_t::output; @@ -245,7 +248,7 @@ struct deconvolution_fwd_pd_t : public deconvolution_pd_t { } int n_inputs() const override { - return 2 + with_bias() + n_prelu_po_inputs() + n_binary_po_inputs(); + return 2 + with_bias() + n_prelu_po_inputs() + n_binary_po_inputs() + n_depthwise_po_inputs() + n_quantization_po_inputs(); } int n_outputs() const override { return 1; } @@ -255,8 +258,7 @@ struct deconvolution_fwd_pd_t : public deconvolution_pd_t { memory_desc_t bias_md_; memory_desc_t dst_md_; - deconvolution_fwd_pd_t(const deconvolution_desc_t *adesc, - const primitive_attr_t *attr, + deconvolution_fwd_pd_t(const op_desc_t *adesc, const primitive_attr_t *attr, const deconvolution_fwd_pd_t *hint_fwd_pd) : deconvolution_pd_t(adesc, attr, hint_fwd_pd) , src_md_(desc_.src_desc) @@ -264,10 +266,12 @@ struct deconvolution_fwd_pd_t : public deconvolution_pd_t { , bias_md_(desc_.bias_desc) , dst_md_(desc_.dst_desc) {} }; +// NOLINTEND(google-default-arguments) +// NOLINTBEGIN(google-default-arguments) struct deconvolution_bwd_data_pd_t : public deconvolution_pd_t { - typedef deconvolution_bwd_data_pd_t base_class; - typedef deconvolution_fwd_pd_t hint_class; + using base_class = deconvolution_bwd_data_pd_t; + using hint_class = deconvolution_fwd_pd_t; arg_usage_t arg_usage(int arg) const override { if (utils::one_of(arg, DNNL_ARG_WEIGHTS, DNNL_ARG_DIFF_DST)) @@ -316,7 +320,7 @@ struct deconvolution_bwd_data_pd_t : public deconvolution_pd_t { memory_desc_t weights_md_; memory_desc_t diff_dst_md_; - deconvolution_bwd_data_pd_t(const deconvolution_desc_t *adesc, + deconvolution_bwd_data_pd_t(const op_desc_t *adesc, const primitive_attr_t *attr, const deconvolution_fwd_pd_t *hint_fwd_pd) : deconvolution_pd_t(adesc, attr, hint_fwd_pd) @@ -324,10 +328,12 @@ struct deconvolution_bwd_data_pd_t : public deconvolution_pd_t { , weights_md_(desc_.weights_desc) , diff_dst_md_(desc_.diff_dst_desc) {} }; 
+// NOLINTEND(google-default-arguments) +// NOLINTBEGIN(google-default-arguments) struct deconvolution_bwd_weights_pd_t : public deconvolution_pd_t { - typedef deconvolution_bwd_weights_pd_t base_class; - typedef deconvolution_fwd_pd_t hint_class; + using base_class = deconvolution_bwd_weights_pd_t; + using hint_class = deconvolution_fwd_pd_t; arg_usage_t arg_usage(int arg) const override { if (utils::one_of(arg, DNNL_ARG_SRC, DNNL_ARG_DIFF_DST)) @@ -335,8 +341,8 @@ struct deconvolution_bwd_weights_pd_t : public deconvolution_pd_t { if (arg == DNNL_ARG_DIFF_WEIGHTS) return arg_usage_t::output; - if (arg == DNNL_ARG_DIFF_BIAS && with_bias()) - return arg_usage_t::output; + if (arg == DNNL_ARG_DIFF_BIAS) + return with_bias() ? arg_usage_t::output : arg_usage_t::unused; return primitive_desc_t::arg_usage(arg); } @@ -381,7 +387,7 @@ struct deconvolution_bwd_weights_pd_t : public deconvolution_pd_t { memory_desc_t diff_bias_md_; memory_desc_t diff_dst_md_; - deconvolution_bwd_weights_pd_t(const deconvolution_desc_t *adesc, + deconvolution_bwd_weights_pd_t(const op_desc_t *adesc, const primitive_attr_t *attr, const deconvolution_fwd_pd_t *hint_fwd_pd) : deconvolution_pd_t(adesc, attr, hint_fwd_pd) @@ -390,6 +396,7 @@ struct deconvolution_bwd_weights_pd_t : public deconvolution_pd_t { , diff_bias_md_(desc_.diff_bias_desc) , diff_dst_md_(desc_.diff_dst_desc) {} }; +// NOLINTEND(google-default-arguments) } // namespace impl } // namespace dnnl diff --git a/src/common/dnnl_debug.cpp b/src/common/dnnl_debug.cpp index 8f2dc99a278..a7aacb41115 100644 --- a/src/common/dnnl_debug.cpp +++ b/src/common/dnnl_debug.cpp @@ -33,6 +33,7 @@ const char *dnnl_runtime2str(unsigned runtime) { case DNNL_RUNTIME_SEQ: return "sequential"; case DNNL_RUNTIME_OMP: return "OpenMP"; case DNNL_RUNTIME_TBB: return "TBB"; + case DNNL_RUNTIME_TBB_AUTO: return "TBB_AUTO"; case DNNL_RUNTIME_OCL: return "OpenCL"; case DNNL_RUNTIME_THREADPOOL: return "threadpool"; #ifdef DNNL_WITH_SYCL @@ -49,8 +50,11 @@ const char *dnnl_fmt_kind2str(dnnl_format_kind_t v) { #ifdef DNNL_EXPERIMENTAL_SPARSE if (v == dnnl_format_kind_sparse) return "sparse"; #endif - if (v == format_kind::wino || v == format_kind::rnn_packed) return "opaque"; + if (v == format_kind::wino || v == format_kind::rnn_packed + || v == format_kind::cublaslt_blocked) + return "opaque"; if (v == dnnl_format_kind_max) return "max"; + if (v == dnnl_format_sparse) return "format_sparse"; assert(!"unknown fmt_kind"); return "unknown fmt_kind"; } diff --git a/src/common/dnnl_debug_autogenerated.cpp b/src/common/dnnl_debug_autogenerated.cpp index 05c3a55f9d7..1cbad069cd3 100644 --- a/src/common/dnnl_debug_autogenerated.cpp +++ b/src/common/dnnl_debug_autogenerated.cpp @@ -1,5 +1,6 @@ /******************************************************************************* -* Copyright 2018-2024 Intel Corporation +* Copyright 2018-2025 Intel Corporation +* Copyright 2024-2025 FUJITSU LIMITED * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
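The *2str debug helpers being updated in dnnl_debug.cpp above and in the autogenerated file below are part of the public dnnl_debug.h interface. A minimal sketch of typical use, with values that exist upstream:

    #include <cstdio>
    #include "oneapi/dnnl/dnnl_debug.h"

    int main() {
        // Each *2str helper returns the string literal kept in sync here.
        std::printf("%s\n", dnnl_dt2str(dnnl_s4));        // prints "s4"
        std::printf("%s\n", dnnl_fmt_tag2str(dnnl_abcd)); // prints "abcd"
        return 0;
    }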
@@ -57,6 +58,10 @@ const char *dnnl_dt2str(dnnl_data_type_t v) {
 if (v == dnnl_s4) return "s4";
 if (v == dnnl_u4) return "u4";
 if (v == dnnl_e8m0) return "e8m0";
+ if (v == dnnl_f4_e2m1) return "f4_e2m1";
+ if (v == dnnl_f4_e3m0) return "f4_e3m0";
+ if (v == dnnl_bin) return "bin";
+ if (v == dnnl_nf4) return "nf4";
 if (v == dnnl_data_type_max) return "data_type_max";
 assert(!"unknown dt");
 return "unknown dt";
@@ -96,6 +103,7 @@ const char *dnnl_sparse_encoding2str(dnnl_sparse_encoding_t v) {
 if (v == dnnl_sparse_encoding_undef) return "undef";
 if (v == dnnl_csr) return "csr";
 if (v == dnnl_packed) return "packed";
+ if (v == dnnl_coo) return "coo";
 assert(!"unknown sparse_encoding");
 return "unknown sparse_encoding";
 }
@@ -203,6 +211,8 @@ const char *dnnl_fmt_tag2str(dnnl_format_tag_t v) {
 if (v == dnnl_ABcd8a16b2a) return "ABcd8a16b2a";
 if (v == dnnl_ABcd2b8a4b) return "ABcd2b8a4b";
 if (v == dnnl_ABcd8a8b) return "ABcd8a8b";
+ if (v == dnnl_ABcd8a32b) return "ABcd8a32b";
+ if (v == dnnl_ABcd16a32b) return "ABcd16a32b";
 if (v == dnnl_ABcd8a4b) return "ABcd8a4b";
 if (v == dnnl_aBcd8b) return "aBcd8b";
 if (v == dnnl_aBCd4c8b2c) return "aBCd4c8b2c";
@@ -301,6 +311,8 @@ const char *dnnl_fmt_tag2str(dnnl_format_tag_t v) {
 if (v == dnnl_aCBdef16c16b) return "aCBdef16c16b";
 if (v == dnnl_aBdefc4b) return "aBdefc4b";
 if (v == dnnl_aBdefc8b) return "aBdefc8b";
+ if (v == dnnl_Abcdef4a) return "Abcdef4a";
+ if (v == dnnl_Abcdef8a) return "Abcdef8a";
 if (v == dnnl_Abcdef16a) return "Abcdef16a";
 if (v == dnnl_Abcdef32a) return "Abcdef32a";
 if (v == dnnl_aBedc16b) return "aBedc16b";
@@ -940,6 +952,18 @@ const char *dnnl_fmt_tag2str(dnnl_format_tag_t v) {
 if (v == dnnl_bcad) return "bcad";
 if (v == dnnl_cabd) return "cabd";
 if (v == dnnl_dabc) return "dabc";
+ if (v == dnnl_Ab32a) return "Ab32a";
+ if (v == dnnl_aCBd8b8c) return "aCBd8b8c";
+ if (v == dnnl_aCBde8b8c) return "aCBde8b8c";
+ if (v == dnnl_BAc8a8b) return "BAc8a8b";
+ if (v == dnnl_BAcd8a8b) return "BAcd8a8b";
+ if (v == dnnl_BAcde8a8b) return "BAcde8a8b";
+ if (v == dnnl_aCBdef8b8c) return "aCBdef8b8c";
+ if (v == dnnl_abdEC16e4c) return "abdEC16e4c";
+ if (v == dnnl_abDC16d4c) return "abDC16d4c";
+ if (v == dnnl_BA24b8a) return "BA24b8a";
+ if (v == dnnl_aCB24c8b) return "aCB24c8b";
+ if (v == dnnl_abDC24d8c) return "abDC24d8c";
 if (v == dnnl_format_tag_last) return "format_tag_last";
 if (v == dnnl_x) return "x";
 if (v == dnnl_nc) return "nc";
@@ -993,9 +1017,11 @@ const char *dnnl_fmt_tag2str(dnnl_format_tag_t v) {
 if (v == dnnl_ldgo) return "ldgo";
 if (v == dnnl_ldOi16o) return "ldOi16o";
 if (v == dnnl_ldOi32o) return "ldOi32o";
+ if (v == dnnl_ldOI16o4i) return "ldOI16o4i";
 if (v == dnnl_ldOI32o4i) return "ldOI32o4i";
 if (v == dnnl_ldIo32i) return "ldIo32i";
 if (v == dnnl_ldgOi16o) return "ldgOi16o";
+ if (v == dnnl_ldgOI16o4i) return "ldgOI16o4i";
 if (v == dnnl_ldgOi32o) return "ldgOi32o";
 if (v == dnnl_ldgOI32o2i) return "ldgOI32o2i";
 if (v == dnnl_ldgOI32o4i) return "ldgOI32o4i";
@@ -1045,6 +1071,7 @@ const char *dnnl_fmt_tag2str(dnnl_format_tag_t v) {
 if (v == dnnl_OI8i24o) return "OI8i24o";
 if (v == dnnl_OI8i16o) return "OI8i16o";
 if (v == dnnl_OI8i8o) return "OI8i8o";
+ if (v == dnnl_IOw8o8i) return "IOw8o8i";
 if (v == dnnl_IOw16o16i) return "IOw16o16i";
 if (v == dnnl_IOw16i16o) return "IOw16i16o";
 if (v == dnnl_OIw16i16o) return "OIw16i16o";
@@ -1113,6 +1140,7 @@ const char *dnnl_fmt_tag2str(dnnl_format_tag_t v) {
 if (v == dnnl_OwI8i16o) return "OwI8i16o";
 if (v == dnnl_OwI8o4i) return "OwI8o4i";
 if (v == dnnl_IOhw16i16o) return "IOhw16i16o";
+ if (v == dnnl_IOhw8o8i) return "IOhw8o8i";
 if (v == dnnl_IOhw16o16i) return "IOhw16o16i";
 if (v == dnnl_Ohwi16o) return "Ohwi16o";
 if (v == dnnl_OhwI16o2i) return "OhwI16o2i";
@@ -1173,6 +1201,8 @@ const char *dnnl_fmt_tag2str(dnnl_format_tag_t v) {
 if (v == dnnl_OIhw8o16i2o) return "OIhw8o16i2o";
 if (v == dnnl_OIhw2i8o4i) return "OIhw2i8o4i";
 if (v == dnnl_IOhw8o16i2o) return "IOhw8o16i2o";
+ if (v == dnnl_OIhw8o32i) return "OIhw8o32i";
+ if (v == dnnl_OIhw16o32i) return "OIhw16o32i";
 if (v == dnnl_OIhw8o8i) return "OIhw8o8i";
 if (v == dnnl_OIhw8o4i) return "OIhw8o4i";
 if (v == dnnl_Owhi16o) return "Owhi16o";
@@ -1243,6 +1273,7 @@ const char *dnnl_fmt_tag2str(dnnl_format_tag_t v) {
 if (v == dnnl_OIdhw8o4i) return "OIdhw8o4i";
 if (v == dnnl_IOdhw16i16o) return "IOdhw16i16o";
 if (v == dnnl_OIdhw4o8i8o4i) return "OIdhw4o8i8o4i";
+ if (v == dnnl_IOdhw8o8i) return "IOdhw8o8i";
 if (v == dnnl_IOdhw16o16i) return "IOdhw16o16i";
 if (v == dnnl_OIdhw16o16i2o) return "OIdhw16o16i2o";
 if (v == dnnl_OIdhw8i32o) return "OIdhw8i32o";
@@ -1254,6 +1285,7 @@ const char *dnnl_fmt_tag2str(dnnl_format_tag_t v) {
 if (v == dnnl_Goiw16g) return "Goiw16g";
 if (v == dnnl_Goiw8g) return "Goiw8g";
 if (v == dnnl_Goiw4g) return "Goiw4g";
+ if (v == dnnl_gIOw8o8i) return "gIOw8o8i";
 if (v == dnnl_gIOw16o16i) return "gIOw16o16i";
 if (v == dnnl_gIOw16i16o) return "gIOw16i16o";
 if (v == dnnl_gOIw16i16o) return "gOIw16i16o";
@@ -1297,6 +1329,7 @@ const char *dnnl_fmt_tag2str(dnnl_format_tag_t v) {
 if (v == dnnl_goIw4i) return "goIw4i";
 if (v == dnnl_goIw32i) return "goIw32i";
 if (v == dnnl_gIOhw16i16o) return "gIOhw16i16o";
+ if (v == dnnl_gIOhw8o8i) return "gIOhw8o8i";
 if (v == dnnl_gIOhw16o16i) return "gIOhw16o16i";
 if (v == dnnl_gOhwi16o) return "gOhwi16o";
 if (v == dnnl_gOhwI16o2i) return "gOhwI16o2i";
@@ -1360,6 +1393,7 @@ const char *dnnl_fmt_tag2str(dnnl_format_tag_t v) {
 if (v == dnnl_gOIhw4i8o2i) return "gOIhw4i8o2i";
 if (v == dnnl_gOIhw4o8i2o) return "gOIhw4o8i2o";
 if (v == dnnl_gIOdhw16i16o) return "gIOdhw16i16o";
+ if (v == dnnl_gIOdhw8o8i) return "gIOdhw8o8i";
 if (v == dnnl_gIOdhw16o16i) return "gIOdhw16o16i";
 if (v == dnnl_gOdhwi16o) return "gOdhwi16o";
 if (v == dnnl_gOdhwI16o2i) return "gOdhwI16o2i";
@@ -1395,6 +1429,8 @@ const char *dnnl_fmt_tag2str(dnnl_format_tag_t v) {
 if (v == dnnl_gIOdhw8o16i2o) return "gIOdhw8o16i2o";
 if (v == dnnl_gOIdhw8o8i) return "gOIdhw8o8i";
 if (v == dnnl_gOIdhw8o4i) return "gOIdhw8o4i";
+ if (v == dnnl_Goidhw4g) return "Goidhw4g";
+ if (v == dnnl_Goidhw8g) return "Goidhw8g";
 if (v == dnnl_Goidhw16g) return "Goidhw16g";
 if (v == dnnl_Goidhw32g) return "Goidhw32g";
 if (v == dnnl_gOIdhw2i4o2i) return "gOIdhw2i4o2i";
@@ -1751,6 +1787,8 @@ const char *dnnl_prim_kind2str(dnnl_primitive_kind_t v) {
 if (v == dnnl_softmax) return "softmax";
 if (v == dnnl_layer_normalization) return "layer_normalization";
 if (v == dnnl_group_normalization) return "group_normalization";
+ if (v == dnnl_depthwise) return "depthwise";
+ if (v == dnnl_quantization) return "quantization";
 if (v == dnnl_primitive_kind_max) return "primitive_kind_max";
 if (v == dnnl::impl::primitive_kind::sdpa) return "sdpa";
 assert(!"unknown prim_kind");
@@ -1785,6 +1823,9 @@ const char *dnnl_alg_kind2str(dnnl_alg_kind_t v) {
 if (v == dnnl_eltwise_round) return "eltwise_round";
 if (v == dnnl_eltwise_mish) return "eltwise_mish";
 if (v == dnnl_eltwise_hardswish) return "eltwise_hardswish";
+ if (v == dnnl_eltwise_hsigmoid) return "eltwise_hsigmoid";
+ if (v == dnnl_eltwise_round_half_to_even) return "eltwise_round_half_to_even";
+ if (v == dnnl_eltwise_round_half_away_from_zero) return "eltwise_round_half_away_from_zero";
 if (v == dnnl_eltwise_relu_use_dst_for_bwd) return "eltwise_relu_use_dst_for_bwd";
 if (v == dnnl_eltwise_tanh_use_dst_for_bwd) return "eltwise_tanh_use_dst_for_bwd";
 if (v == dnnl_eltwise_elu_use_dst_for_bwd) return "eltwise_elu_use_dst_for_bwd";
@@ -1815,6 +1856,8 @@ const char *dnnl_alg_kind2str(dnnl_alg_kind_t v) {
 if (v == dnnl_binary_lt) return "binary_lt";
 if (v == dnnl_binary_eq) return "binary_eq";
 if (v == dnnl_binary_ne) return "binary_ne";
+ if (v == dnnl_binary_select) return "binary_select";
+ if (v == dnnl_binary_prelu) return "binary_prelu";
 if (v == dnnl_resampling_nearest) return "resampling_nearest";
 if (v == dnnl_resampling_linear) return "resampling_linear";
 if (v == dnnl_reduction_max) return "reduction_max";
@@ -1828,10 +1871,15 @@ const char *dnnl_alg_kind2str(dnnl_alg_kind_t v) {
 if (v == dnnl_reduction_norm_lp_power_p_sum) return "reduction_norm_lp_power_p_sum";
 if (v == dnnl_softmax_accurate) return "softmax_accurate";
 if (v == dnnl_softmax_log) return "softmax_log";
+ if (v == dnnl_depthwise_scale_shift) return "depthwise_scale_shift";
+ if (v == dnnl_depthwise_prelu) return "depthwise_prelu";
+ if (v == dnnl_quantization_quantize_dequantize) return "quantization_quantize_dequantize";
+ if (v == dnnl_quantization_quantize) return "quantization_quantize";
+ if (v == dnnl_binarization_depthwise) return "binarization_depthwise";
 assert(!"unknown alg_kind");
 return "unknown alg_kind";
 }

 const char *dnnl_rnn_flags2str(dnnl_rnn_flags_t v) {
 if (v == dnnl_rnn_flags_undef) return "undef";
 if (v == dnnl_rnn_flags_diff_weights_overwrite) return "rnn_flags_diff_weights_overwrite";
diff --git a/src/common/dnnl_sel_build.hpp b/src/common/dnnl_sel_build.hpp
new file mode 100644
index 00000000000..fee17f6685b
--- /dev/null
+++ b/src/common/dnnl_sel_build.hpp
@@ -0,0 +1,103 @@
+/*******************************************************************************
+* Copyright 2020 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#pragma once
+
+
+#define DNNL_MACRO_EXPAND(x) x
+
+#define DNNL_MACRO_CAT_(x, y) x ## y
+#define DNNL_MACRO_CAT(x, y) DNNL_MACRO_CAT_(x, y)
+#define DNNL_MACRO_CAT3_(x, y, z) x ## y ## z
+#define DNNL_MACRO_CAT3(x, y, z) DNNL_MACRO_CAT3_(x, y, z)
+
+#define DNNL_MACRO_TOSTRING(...) DNNL_MACRO_TOSTRING_(__VA_ARGS__)
+#define DNNL_MACRO_TOSTRING_(...) #__VA_ARGS__
+
+#define DNNL_MACRO_NARG(...)
DNNL_MACRO_EXPAND( DNNL_MACRO_NARG_(__VA_ARGS__, DNNL_MACRO_RSEQ_N()) ) +#define DNNL_MACRO_NARG_(...) DNNL_MACRO_EXPAND( DNNL_MACRO_ARG_N(__VA_ARGS__) ) +#define DNNL_MACRO_ARG_N(_1, _2, _3, _4, _5, _6, _7, _8, _9, _10, N, ...) N +#define DNNL_MACRO_RSEQ_N() 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 + +#define DNNL_MACRO_EVAL_(NAME, N) NAME ## _ ## N +#define DNNL_MACRO_EVAL(NAME, N) DNNL_MACRO_EVAL_(NAME, N) + +#define DNNL_MACRO_OVERLOAD(NAME, ...) \ + DNNL_MACRO_EXPAND( DNNL_MACRO_EVAL(NAME, DNNL_MACRO_EXPAND( DNNL_MACRO_NARG(__VA_ARGS__) ))(__VA_ARGS__) ) + +#if defined(SELECTIVE_BUILD_ANALYZER) + +# include + +namespace dnnl { + +OV_CC_DOMAINS(DNNL) + +} // namespace dnnl + +# define DNNL_CSCOPE(region) OV_SCOPE(DNNL, region) + +# define DNNL_PRIMITIVE_NAME_INIT(pd_t) name = typeid(pd_t).name(); +# define DNNL_PRIMITIVE_CREATE(pd_t) OV_ITT_SCOPED_TASK(dnnl::FACTORY_DNNL, std::string("CREATE$CPUEngine$") + typeid(pd_t).name()); +# define DNNL_PRIMITIVE_IMPL(...) DNNL_MACRO_OVERLOAD(DNNL_PRIMITIVE_IMPL, __VA_ARGS__), +# define DNNL_PRIMITIVE_IMPL_2(expr, type) dnnl::impl::move(expr(type), OV_CC_TOSTRING(type)) +# define DNNL_PRIMITIVE_IMPL_3(expr, type, t1) dnnl::impl::move(expr(type), OV_CC_TOSTRING(type ## _ ## t1)) +# define DNNL_PRIMITIVE_IMPL_4(expr, type, t1, t2) dnnl::impl::move(expr(type), OV_CC_TOSTRING(type ## _ ## t1 ## _ ## t2)) +# define DNNL_PRIMITIVE_IMPL_5(expr, type, t1, t2, t3) dnnl::impl::move(expr(type), OV_CC_TOSTRING(type ## _ ## t1 ## _ ## t2 ## _ ## t3)) +# define DNNL_PRIMITIVE_IMPL_6(expr, type, t1, t2, t3, t4) dnnl::impl::move(expr(type), OV_CC_TOSTRING(type ## _ ## t1 ## _ ## t2 ## _ ## t3 ## _ ## t4)) +# define DNNL_PRIMITIVE_IMPL_7(expr, type, t1, t2, t3, t4, t5) dnnl::impl::move(expr(type), OV_CC_TOSTRING(type ## _ ## t1 ## _ ## t2 ## _ ## t3 ## _ ## t4 ## _ ## t5)) +# define DNNL_PRIMITIVE_IMPL_8(expr, type, t1, t2, t3, t4, t5, t6) dnnl::impl::move(expr(type), OV_CC_TOSTRING(type ## _ ## t1 ## _ ## t2 ## _ ## t3 ## _ ## t4 ## _ ## t5 ## _ ## t6)) +# define DNNL_PRIMITIVE_IMPL_9(expr, type, t1, t2, t3, t4, t5, t6, t7) dnnl::impl::move(expr(type), OV_CC_TOSTRING(type ## _ ## t1 ## _ ## t2 ## _ ## t3 ## _ ## t4 ## _ ## t5 ## _ ## t6 ## _ ## t7)) + +#elif defined(SELECTIVE_BUILD) + +# include + +# define DNNL_CSCOPE(region) OV_SCOPE(DNNL, region) + +# define DNNL_OBJ_BUILDER_0(...) +# define DNNL_OBJ_BUILDER_1(...) __VA_ARGS__, +# define DNNL_OBJ_BUILDER(name, ...) OV_CC_EXPAND(OV_CC_CAT(DNNL_OBJ_BUILDER_, OV_CC_EXPAND(OV_CC_SCOPE_IS_ENABLED(OV_CC_CAT(DNNL_, name))))(__VA_ARGS__)) + +# define DNNL_PRIMITIVE_NAME_INIT(pd_t) +# define DNNL_PRIMITIVE_CREATE(pd_t) +# define DNNL_PRIMITIVE_IMPL(...) 
DNNL_MACRO_OVERLOAD(DNNL_PRIMITIVE_IMPL, __VA_ARGS__) +# define DNNL_PRIMITIVE_IMPL_2(expr, type) DNNL_OBJ_BUILDER(type, expr(type)) +# define DNNL_PRIMITIVE_IMPL_3(expr, type, t1) DNNL_OBJ_BUILDER(type ## _ ## t1, expr(type)) +# define DNNL_PRIMITIVE_IMPL_4(expr, type, t1, t2) DNNL_OBJ_BUILDER(type ## _ ## t1 ## _ ## t2, expr(type)) +# define DNNL_PRIMITIVE_IMPL_5(expr, type, t1, t2, t3) DNNL_OBJ_BUILDER(type ## _ ## t1 ## _ ## t2 ## _ ## t3, expr(type)) +# define DNNL_PRIMITIVE_IMPL_6(expr, type, t1, t2, t3, t4) DNNL_OBJ_BUILDER(type ## _ ## t1 ## _ ## t2 ## _ ## t3 ## _ ## t4, expr(type)) +# define DNNL_PRIMITIVE_IMPL_7(expr, type, t1, t2, t3, t4, t5) DNNL_OBJ_BUILDER(type ## _ ## t1 ## _ ## t2 ## _ ## t3 ## _ ## t4 ## _ ## t5, expr(type)) +# define DNNL_PRIMITIVE_IMPL_8(expr, type, t1, t2, t3, t4, t5, t6) DNNL_OBJ_BUILDER(type ## _ ## t1 ## _ ## t2 ## _ ## t3 ## _ ## t4 ## _ ## t5 ## _ ## t6, expr(type)) +# define DNNL_PRIMITIVE_IMPL_9(expr, type, t1, t2, t3, t4, t5, t6, t7) DNNL_OBJ_BUILDER(type ## _ ## t1 ## _ ## t2 ## _ ## t3 ## _ ## t4 ## _ ## t5 ## _ ## t6 ## _ ## t7, expr(type)) + +#else + +# define DNNL_CSCOPE(region) + +# define DNNL_PRIMITIVE_NAME_INIT(pd_t) +# define DNNL_PRIMITIVE_CREATE(pd_t) +# define DNNL_PRIMITIVE_IMPL(...) DNNL_MACRO_OVERLOAD(DNNL_PRIMITIVE_IMPL, __VA_ARGS__), +# define DNNL_PRIMITIVE_IMPL_2(expr, type) expr(type) +# define DNNL_PRIMITIVE_IMPL_3(expr, type, t1) expr(type) +# define DNNL_PRIMITIVE_IMPL_4(expr, type, t1, t2) expr(type) +# define DNNL_PRIMITIVE_IMPL_5(expr, type, t1, t2, t3) expr(type) +# define DNNL_PRIMITIVE_IMPL_6(expr, type, t1, t2, t3, t4) expr(type) +# define DNNL_PRIMITIVE_IMPL_7(expr, type, t1, t2, t3, t4, t5) expr(type) +# define DNNL_PRIMITIVE_IMPL_8(expr, type, t1, t2, t3, t4, t5, t6) expr(type) +# define DNNL_PRIMITIVE_IMPL_9(expr, type, t1, t2, t3, t4, t5, t6, t7) expr(type) + +#endif diff --git a/src/common/dnnl_thread.cpp b/src/common/dnnl_thread.cpp new file mode 100644 index 00000000000..e28f92b3557 --- /dev/null +++ b/src/common/dnnl_thread.cpp @@ -0,0 +1,102 @@ +#include + +#include "dnnl_thread.hpp" + +#if defined(DNNL_ENABLE_ITT_TASKS) +#include "common/ittnotify.hpp" +#endif + +#if DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_THREADPOOL +#include "counting_barrier.hpp" +#endif + +namespace dnnl { +namespace impl { + +void parallel(int nthr, const std::function &f) { + nthr = adjust_num_threads(nthr, INT64_MAX); +#if DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_SEQ + for (int i = 0; i < nthr; ++i) { + f(i, nthr); + } +#else +#if defined(DNNL_ENABLE_ITT_TASKS) + auto task_primitive_kind = itt::primitive_task_get_current_kind(); + bool itt_enable = itt::get_itt(itt::__itt_task_level_high); +#endif + if (nthr == 1) { + f(0, 1); + return; + } +#if DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_OMP +#pragma omp parallel num_threads(nthr) + { + int nthr_ = omp_get_num_threads(); + int ithr_ = omp_get_thread_num(); + assert(nthr_ == nthr); +#if defined(DNNL_ENABLE_ITT_TASKS) + if (ithr_ && itt_enable) itt::primitive_task_start(task_primitive_kind); +#endif + f(ithr_, nthr_); +#if defined(DNNL_ENABLE_ITT_TASKS) + if (ithr_ && itt_enable) itt::primitive_task_end(); +#endif + } +#elif DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_TBB + tbb::parallel_for( + 0, nthr, + [&](int ithr) { +#if defined(DNNL_ENABLE_ITT_TASKS) + bool mark_task = itt::primitive_task_get_current_kind() + == primitive_kind::undefined; + if (mark_task && itt_enable) + itt::primitive_task_start(task_primitive_kind); +#endif + f(ithr, nthr); +#if 
defined(DNNL_ENABLE_ITT_TASKS)
+ if (mark_task && itt_enable) itt::primitive_task_end();
+#endif
+ },
+ tbb::static_partitioner());
+#elif DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_TBB_AUTO
+ tbb::parallel_for(
+ 0, nthr, [&](int ithr) { f(ithr, nthr); });
+#elif DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_THREADPOOL
+ using namespace dnnl::impl::threadpool_utils;
+ dnnl::threadpool_interop::threadpool_iface *tp = get_active_threadpool();
+ if (!tp || dnnl_in_parallel()) {
+ threadpool_utils::deactivate_threadpool();
+ for (int ithr = 0; ithr < nthr; ithr++) {
+ f(ithr, nthr);
+ }
+ threadpool_utils::activate_threadpool(tp);
+ } else {
+ bool async = tp->get_flags()
+ & dnnl::threadpool_interop::threadpool_iface::ASYNCHRONOUS;
+ counting_barrier_t b;
+ if (async) b.init(nthr);
+ tp->parallel_for(nthr, [&, tp](int ithr, int nthr) {
+ bool is_master = threadpool_utils::get_active_threadpool() == tp;
+ if (!is_master) {
+ threadpool_utils::activate_threadpool(tp);
+#if defined(DNNL_ENABLE_ITT_TASKS)
+ if (itt_enable) itt::primitive_task_start(task_primitive_kind);
+#endif
+ }
+ f(ithr, nthr);
+ if (!is_master) {
+#if defined(DNNL_ENABLE_ITT_TASKS)
+ if (itt_enable) itt::primitive_task_end();
+#endif
+ threadpool_utils::deactivate_threadpool();
+ }
+ if (async) b.notify();
+ });
+ if (async) b.wait();
+ }
+#endif
+#endif
+}
+
+} // namespace impl
+} // namespace dnnl
diff --git a/src/common/dnnl_thread.hpp b/src/common/dnnl_thread.hpp
index 6122819a308..2f13770dec5 100644
--- a/src/common/dnnl_thread.hpp
+++ b/src/common/dnnl_thread.hpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2017-2023 Intel Corporation
+* Copyright 2017-2025 Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -75,7 +75,7 @@ inline void dnnl_thr_barrier() {
 #pragma omp barrier
 }

-#elif DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_TBB
+#elif (DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_TBB || DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_TBB_AUTO)
 #include "tbb/parallel_for.h"
 #include "tbb/task_arena.h"
 #define DNNL_THR_SYNC 0
@@ -184,25 +184,25 @@ inline int dnnl_get_current_num_threads() {
 #define OMP_GET_NUM_THREADS() 1
 #endif

-// MSVC still supports omp 2.0 only
-#if defined(_MSC_VER) && !defined(__clang__) && !defined(__INTEL_COMPILER)
+// Disable OMP SIMD in the following scenarios:
+// * For MSVC, which only supports OpenMP 2.0. Note that VS2019 offers the
+//   SIMD functionality with the -openmp:experimental compilation switch,
+//   which enables additional OpenMP features that are not available when
+//   using the plain -openmp switch.
+// * In debug mode on Windows, to avoid incorrect code generation by the
+//   Intel(R) oneAPI DPC++/C++ Compiler.
+#if defined(_MSC_VER) && (_MSC_VER < 1900) \
+ && ((!defined(__clang__) && !defined(__INTEL_COMPILER)) \
+ || defined(_DEBUG))
 #define collapse(x)
 #define PRAGMA_OMP_SIMD(...)
 #else
 #define PRAGMA_OMP_SIMD(...) PRAGMA_MACRO(CHAIN2(omp, simd __VA_ARGS__))
-#endif // defined(_MSC_VER) && !defined(__clang__) && !defined(__INTEL_COMPILER)
-
-// process simdlen; it is supported for Clang >= 3.9; ICC >= 17.0; GCC >= 6.1
-// No support on Windows.
-#if (defined(__clang_major__) \ - && (__clang_major__ < 3 \ - || (__clang_major__ == 3 && __clang_minor__ < 9))) \ - || (defined(__INTEL_COMPILER) && __INTEL_COMPILER < 1700) \ - || (!defined(__INTEL_COMPILER) && !defined(__clang__) \ - && (defined(_MSC_VER) || __GNUC__ < 6 \ - || (__GNUC__ == 6 && __GNUC_MINOR__ < 1))) -#define simdlen(x) -#endif // long simdlen if +#endif // defined(_MSC_VER) && ((!defined(__clang__) && !defined(__INTEL_COMPILER)) || defined(_DEBUG)) + +#if defined(DNNL_ENABLE_ITT_TASKS) +#include "common/ittnotify.hpp" +#endif namespace dnnl { namespace impl { @@ -282,87 +282,7 @@ inline int adjust_num_threads(int nthr, dim_t work_amount) { #endif } -static inline void parallel(int nthr, const std::function &f) { - nthr = adjust_num_threads(nthr, INT64_MAX); -#if DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_SEQ - for (int i = 0; i < nthr; ++i) { - f(i, nthr); - } -#else -#if defined(DNNL_ENABLE_ITT_TASKS) - auto task_primitive_kind = itt::primitive_task_get_current_kind(); - bool itt_enable = itt::get_itt(itt::__itt_task_level_high); -#endif - if (nthr == 1) { - f(0, 1); - return; - } -#if DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_OMP -#pragma omp parallel num_threads(nthr) - { - int nthr_ = omp_get_num_threads(); - int ithr_ = omp_get_thread_num(); - assert(nthr_ == nthr); -#if defined(DNNL_ENABLE_ITT_TASKS) - if (ithr_ && itt_enable) itt::primitive_task_start(task_primitive_kind); -#endif - f(ithr_, nthr_); -#if defined(DNNL_ENABLE_ITT_TASKS) - if (ithr_ && itt_enable) itt::primitive_task_end(); -#endif - } -#elif DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_TBB - tbb::parallel_for( - 0, nthr, - [&](int ithr) { -#if defined(DNNL_ENABLE_ITT_TASKS) - bool mark_task = itt::primitive_task_get_current_kind() - == primitive_kind::undefined; - if (mark_task && itt_enable) - itt::primitive_task_start(task_primitive_kind); -#endif - f(ithr, nthr); -#if defined(DNNL_ENABLE_ITT_TASKS) - if (mark_task && itt_enable) itt::primitive_task_end(); -#endif - }, - tbb::static_partitioner()); -#elif DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_THREADPOOL - using namespace dnnl::impl::threadpool_utils; - dnnl::threadpool_interop::threadpool_iface *tp = get_active_threadpool(); - if (!tp || dnnl_in_parallel()) { - threadpool_utils::deactivate_threadpool(); - for (int ithr = 0; ithr < nthr; ithr++) { - f(ithr, nthr); - } - threadpool_utils::activate_threadpool(tp); - } else { - bool async = tp->get_flags() - & dnnl::threadpool_interop::threadpool_iface::ASYNCHRONOUS; - counting_barrier_t b; - if (async) b.init(nthr); - tp->parallel_for(nthr, [&, tp](int ithr, int nthr) { - bool is_master = threadpool_utils::get_active_threadpool() == tp; - if (!is_master) { - threadpool_utils::activate_threadpool(tp); -#if defined(DNNL_ENABLE_ITT_TASKS) - if (itt_enable) itt::primitive_task_start(task_primitive_kind); -#endif - } - f(ithr, nthr); - if (!is_master) { -#if defined(DNNL_ENABLE_ITT_TASKS) - if (itt_enable) itt::primitive_task_end(); -#endif - threadpool_utils::deactivate_threadpool(); - } - if (async) b.notify(); - }); - if (async) b.wait(); - } -#endif -#endif -} +void DNNL_API parallel(int nthr, const std::function &f); // XXX: IMPORTANT!!! // Keep the functions below static. 
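With parallel() now only declared here and defined out of line in dnnl_thread.cpp, a sketch of the internal usage pattern it supports, using the same balance211-based splitting the for_nd helpers rely on (internal API; assumes dnnl_thread.hpp and utils.hpp are included):

    #include <cstddef>

    void process_range(std::size_t begin, std::size_t end); // assumed elsewhere

    void run_balanced(std::size_t work) {
        using namespace dnnl::impl;
        int nthr = adjust_num_threads(
                dnnl_get_current_num_threads(), static_cast<dim_t>(work));
        parallel(nthr, [&](int ithr, int nthr_) {
            std::size_t start {0}, end {0};
            balance211(work, nthr_, ithr, start, end);
            process_range(start, end); // each thread gets a contiguous slice
        });
    }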
@@ -652,6 +572,171 @@ static inline void parallel_nd(dim_t D0, dim_t D1, dim_t D2, dim_t D3, dim_t D4, }); } +template +void parallel_legacy(int nthr, F f) { + nthr = adjust_num_threads(nthr, INT64_MAX); +#if DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_SEQ + assert(nthr == 1); + f(0, 1); +#else +#if defined(DNNL_ENABLE_ITT_TASKS) + auto task_primitive_kind = itt::primitive_task_get_current_kind(); + bool itt_enable = itt::get_itt(itt::__itt_task_level_high); +#endif + if (nthr == 1) { + f(0, 1); + return; + } +#if DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_OMP +#pragma omp parallel num_threads(nthr) + { + int nthr_ = omp_get_num_threads(); + int ithr_ = omp_get_thread_num(); + assert(nthr_ == nthr); +#if defined(DNNL_ENABLE_ITT_TASKS) + if (ithr_ && itt_enable) itt::primitive_task_start(task_primitive_kind); +#endif + f(ithr_, nthr_); +#if defined(DNNL_ENABLE_ITT_TASKS) + if (ithr_ && itt_enable) itt::primitive_task_end(); +#endif + } +#elif DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_TBB + tbb::parallel_for( + 0, nthr, + [&](int ithr) { +#if defined(DNNL_ENABLE_ITT_TASKS) + bool mark_task = itt::primitive_task_get_current_kind() + == primitive_kind::undefined; + if (mark_task && itt_enable) + itt::primitive_task_start(task_primitive_kind); +#endif + f(ithr, nthr); +#if defined(DNNL_ENABLE_ITT_TASKS) + if (mark_task && itt_enable) itt::primitive_task_end(); +#endif + }, + tbb::static_partitioner()); +#elif DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_TBB_AUTO + tbb::parallel_for( + 0, nthr, [&](int ithr) { f(ithr, nthr); }); +#elif DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_THREADPOOL + using namespace dnnl::impl::threadpool_utils; + dnnl::threadpool_interop::threadpool_iface *tp = get_active_threadpool(); + if (!tp || dnnl_in_parallel()) { + threadpool_utils::deactivate_threadpool(); + for (int ithr = 0; ithr < nthr; ithr++) { + f(ithr, nthr); + } + threadpool_utils::activate_threadpool(tp); + } else { + bool async = tp->get_flags() + & dnnl::threadpool_interop::threadpool_iface::ASYNCHRONOUS; + counting_barrier_t b; + if (async) b.init(nthr); + tp->parallel_for(nthr, [&, tp](int ithr, int nthr) { + bool is_master = threadpool_utils::get_active_threadpool() == tp; + if (!is_master) { + threadpool_utils::activate_threadpool(tp); +#if defined(DNNL_ENABLE_ITT_TASKS) + if (itt_enable) itt::primitive_task_start(task_primitive_kind); +#endif + } + f(ithr, nthr); + if (!is_master) { +#if defined(DNNL_ENABLE_ITT_TASKS) + if (itt_enable) itt::primitive_task_end(); +#endif + threadpool_utils::deactivate_threadpool(); + } + if (async) b.notify(); + }); + if (async) b.wait(); + } +#endif +#endif +} + +template +void for_nd_legacy(const int ithr, const int nthr, const T0 &D0, F f) { + T0 start {0}, end {0}; + balance211(D0, nthr, ithr, start, end); + for (T0 d0 = start; d0 < end; ++d0) + f(d0); +} + +template +void for_nd_legacy(const int ithr, const int nthr, const T0 &D0, const T1 &D1, + const T2 &D2, const T3 &D3, F f) { + const size_t work_amount = (size_t)D0 * D1 * D2 * D3; + if (work_amount == 0) return; + size_t start {0}, end {0}; + balance211(work_amount, nthr, ithr, start, end); + + T0 d0 {0}; + T1 d1 {0}; + T2 d2 {0}; + T3 d3 {0}; + utils::nd_iterator_init(start, d0, D0, d1, D1, d2, D2, d3, D3); + for (size_t iwork = start; iwork < end; ++iwork) { + f(d0, d1, d2, d3); + utils::nd_iterator_step(d0, D0, d1, D1, d2, D2, d3, D3); + } +} + +template +void for_nd_legacy(const int ithr, const int nthr, const T0 &D0, const T1 &D1, + const T2 &D2, const T3 &D3, const T4 &D4, const T5 &D5, F f) { + const 
size_t work_amount = (size_t)D0 * D1 * D2 * D3 * D4 * D5; + if (work_amount == 0) return; + size_t start {0}, end {0}; + balance211(work_amount, nthr, ithr, start, end); + + T0 d0 {0}; + T1 d1 {0}; + T2 d2 {0}; + T3 d3 {0}; + T4 d4 {0}; + T5 d5 {0}; + utils::nd_iterator_init( + start, d0, D0, d1, D1, d2, D2, d3, D3, d4, D4, d5, D5); + for (size_t iwork = start; iwork < end; ++iwork) { + f(d0, d1, d2, d3, d4, d5); + utils::nd_iterator_step(d0, D0, d1, D1, d2, D2, d3, D3, d4, D4, d5, D5); + } +} + +template +void parallel_nd_legacy(const T0 &D0, F f) { + const size_t work_amount = (size_t)D0; + int nthr = adjust_num_threads(dnnl_get_current_num_threads(), work_amount); + if (nthr) + parallel_legacy(nthr, [&](int ithr, int nthr) { for_nd_legacy(ithr, nthr, D0, f); }); +} + +template +void parallel_nd_legacy(const T0 &D0, const T1 &D1, const T2 &D2, const T3 &D3, F f) { + const size_t work_amount = (size_t)D0 * D1 * D2 * D3; + int nthr = adjust_num_threads(dnnl_get_current_num_threads(), work_amount); + if (nthr) + parallel_legacy(nthr, [&](int ithr, int nthr) { + for_nd_legacy(ithr, nthr, D0, D1, D2, D3, f); + }); +} + +template +void parallel_nd_legacy(const T0 &D0, const T1 &D1, const T2 &D2, const T3 &D3, + const T4 &D4, const T5 &D5, F f) { + const size_t work_amount = (size_t)D0 * D1 * D2 * D3 * D4 * D5; + int nthr = adjust_num_threads(dnnl_get_current_num_threads(), work_amount); + if (nthr) + parallel_legacy(nthr, [&](int ithr, int nthr) { + for_nd_legacy(ithr, nthr, D0, D1, D2, D3, D4, D5, f); + }); +} + } // namespace impl } // namespace dnnl diff --git a/src/common/dnnl_traits.hpp b/src/common/dnnl_traits.hpp index cefdf1a80ee..4f9b8029282 100644 --- a/src/common/dnnl_traits.hpp +++ b/src/common/dnnl_traits.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2016-2024 Intel Corporation +* Copyright 2016-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
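A sketch of how the parallel_nd_legacy helpers added to dnnl_thread.hpp above are meant to be called; the flattened D0 * D1 * D2 * D3 index space is balanced across threads, and touch() is a placeholder:

    #include <cstdint>

    void touch(int64_t n, int64_t c, int64_t h, int64_t w); // placeholder

    void walk_nchw(int64_t N, int64_t C, int64_t H, int64_t W) {
        dnnl::impl::parallel_nd_legacy(N, C, H, W,
                [&](int64_t n, int64_t c, int64_t h, int64_t w) {
                    touch(n, c, h, w);
                });
    }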
@@ -17,171 +17,166 @@
 #ifndef COMMON_DNNL_TRAITS_HPP
 #define COMMON_DNNL_TRAITS_HPP

-#include <assert.h>
-#include <stdint.h>
-
-#include "oneapi/dnnl/dnnl.h"
-
 #include "bfloat16.hpp"
 #include "c_types_map.hpp"
 #include "float16.hpp"
+#include "float4.hpp"
 #include "float8.hpp"
-#include "nstl.hpp"
-#include "opdesc.hpp"
-#include "utils.hpp"
-#include "z_magic.hpp"
+#include "int4.hpp"
+
+#include <cstdint>

 namespace dnnl {
 namespace impl {

 template <data_type_t>
-struct prec_traits {}; /* ::type -> float */
+struct prec_traits_t {}; /* ::type -> float */
 template <typename>
-struct data_traits {}; /* ::data_type -> f32 */
+struct data_traits_t {}; /* ::data_type -> f32 */
 template <int>
-struct typesize_traits {}; /* ::data_type_size -> f32 */
+struct typesize_traits_t {}; /* ::data_type_size -> f32 */
 template <primitive_kind_t>
-struct pkind_traits {}; /* ::desc_type, ::query_d */
+struct pkind_traits_t {}; /* ::desc_type, ::query_d */

 template <>
-struct prec_traits<data_type::e8m0> {
- typedef float8_e8m0_t type;
+struct prec_traits_t<data_type::f4_e3m0> {
+ using type = float4_e3m0_t;
+};
+template <>
+struct prec_traits_t<data_type::f4_e2m1> {
+ using type = float4_e2m1_t;
 };
 template <>
-struct prec_traits<data_type::f8_e5m2> {
- typedef float8_e5m2_t type;
+struct prec_traits_t<data_type::e8m0> {
+ using type = float8_e8m0_t;
 };
 template <>
-struct prec_traits<data_type::f8_e4m3> {
- typedef float8_e4m3_t type;
+struct prec_traits_t<data_type::f8_e5m2> {
+ using type = float8_e5m2_t;
 };
 template <>
-struct prec_traits<data_type::f16> {
- typedef float16_t type;
+struct prec_traits_t<data_type::f8_e4m3> {
+ using type = float8_e4m3_t;
 };
 template <>
-struct prec_traits<data_type::bf16> {
- typedef bfloat16_t type;
+struct prec_traits_t<data_type::f16> {
+ using type = float16_t;
 };
 template <>
-struct prec_traits<data_type::f32> {
- typedef float type;
+struct prec_traits_t<data_type::bf16> {
+ using type = bfloat16_t;
 };
 template <>
-struct prec_traits<data_type::f64> {
- typedef double type;
+struct prec_traits_t<data_type::f32> {
+ using type = float;
 };
 template <>
-struct prec_traits<data_type::s32> {
- typedef int32_t type;
+struct prec_traits_t<data_type::f64> {
+ using type = double;
 };
 template <>
-struct prec_traits<data_type::s8> {
- typedef int8_t type;
+struct prec_traits_t<data_type::s32> {
+ using type = int32_t;
 };
 template <>
-struct prec_traits<data_type::u8> {
- typedef uint8_t type;
+struct prec_traits_t<data_type::s8> {
+ using type = int8_t;
 };
 template <>
-struct prec_traits<data_type::s4> {
- typedef int4_t type;
+struct prec_traits_t<data_type::u8> {
+ using type = uint8_t;
 };
 template <>
-struct prec_traits<data_type::u4> {
- typedef uint4_t type;
+struct prec_traits_t<data_type::s4> {
+ using type = int4_t;
 };
 template <>
-struct prec_traits<data_type::boolean> {
- typedef bool type;
+struct prec_traits_t<data_type::u4> {
+ using type = uint4_t;
+};
+template <>
+struct prec_traits_t<data_type::boolean> {
+ using type = bool;
+};
+
+template <> struct prec_traits_t<data_type::bin> {
+ using type = uint8_t;
+};
+
+template <> struct prec_traits_t<data_type::nf4> {
+ using type = uint8_t;
 };

 template <>
-struct data_traits<float8_e5m2_t> {
+struct data_traits_t<float4_e3m0_t> {
+ static constexpr data_type_t data_type = data_type::f4_e3m0;
+};
+template <>
+struct data_traits_t<float4_e2m1_t> {
+ static constexpr data_type_t data_type = data_type::f4_e2m1;
+};
+template <>
+struct data_traits_t<float8_e8m0_t> {
+ static constexpr data_type_t data_type = data_type::e8m0;
+};
+template <>
+struct data_traits_t<float8_e5m2_t> {
 static constexpr data_type_t data_type = data_type::f8_e5m2;
 };
 template <>
-struct data_traits<float8_e4m3_t> {
+struct data_traits_t<float8_e4m3_t> {
 static constexpr data_type_t data_type = data_type::f8_e4m3;
 };
 template <>
-struct data_traits<float16_t> {
+struct data_traits_t<float16_t> {
 static constexpr data_type_t data_type = data_type::f16;
 };
 template <>
-struct data_traits<bfloat16_t> {
+struct data_traits_t<bfloat16_t> {
 static constexpr data_type_t data_type = data_type::bf16;
 };
 template <>
-struct data_traits<float> {
+struct data_traits_t<float> {
 static constexpr data_type_t data_type = data_type::f32;
 };
 template <>
-struct data_traits<int32_t> {
+struct data_traits_t<int32_t> {
 static constexpr data_type_t data_type = data_type::s32;
 };
 template <>
-struct data_traits<int8_t> {
+struct data_traits_t<int8_t> {
 static constexpr data_type_t data_type = data_type::s8;
 };
 template <>
-struct data_traits<uint8_t> {
+struct data_traits_t<uint8_t> {
 static constexpr data_type_t data_type = data_type::u8;
 };
 template <>
-struct data_traits<int4_t> {
+struct data_traits_t<int4_t> {
 static constexpr data_type_t data_type = data_type::s4;
 };
 template <>
-struct data_traits<uint4_t> {
+struct data_traits_t<uint4_t> {
 static constexpr data_type_t data_type = data_type::u4;
 };
 template <>
-struct data_traits<bool> {
+struct data_traits_t<bool> {
 static constexpr data_type_t data_type = data_type::boolean;
 };

 template <>
-struct typesize_traits<4> {
- typedef float type;
+struct typesize_traits_t<4> {
+ using type = float;
 };
 template <>
-struct typesize_traits<2> {
- typedef int16_t type;
+struct typesize_traits_t<2> {
+ using type = int16_t;
 };
 template <>
-struct typesize_traits<1> {
- typedef uint8_t type;
+struct typesize_traits_t<1> {
+ using type = uint8_t;
 };

-#define PKIND_TRAITS_INST(op) \
- template <> \
- struct pkind_traits<primitive_kind::op> { \
- typedef CONCAT2(op, _desc_t) desc_type; \
- }
-PKIND_TRAITS_INST(convolution);
-PKIND_TRAITS_INST(deconvolution);
-PKIND_TRAITS_INST(shuffle);
-PKIND_TRAITS_INST(eltwise);
-PKIND_TRAITS_INST(softmax);
-PKIND_TRAITS_INST(pooling);
-PKIND_TRAITS_INST(prelu);
-PKIND_TRAITS_INST(lrn);
-PKIND_TRAITS_INST(batch_normalization);
-PKIND_TRAITS_INST(group_normalization);
-PKIND_TRAITS_INST(layer_normalization);
-PKIND_TRAITS_INST(inner_product);
-PKIND_TRAITS_INST(rnn);
-PKIND_TRAITS_INST(gemm);
-PKIND_TRAITS_INST(zero_pad);
-PKIND_TRAITS_INST(binary);
-PKIND_TRAITS_INST(matmul);
-PKIND_TRAITS_INST(resampling);
-PKIND_TRAITS_INST(reduction);
-PKIND_TRAITS_INST(sum);
-PKIND_TRAITS_INST(sdpa);
-#undef PKIND_TRAITS_INST
-
 } // namespace impl
 } // namespace dnnl
diff --git a/src/common/eltwise.cpp b/src/common/eltwise.cpp
index 356584a54d0..ba7675120f8 100644
--- a/src/common/eltwise.cpp
+++ b/src/common/eltwise.cpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2016-2023 Intel Corporation
+* Copyright 2016-2025 Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -57,7 +57,8 @@ status_t eltwise_desc_init(eltwise_desc_t *eltwise_desc, prop_kind_t prop_kind,
 VCHECK_ELTWISE(
 IMPLICATION(!is_fwd, !any_null(diff_src_desc, diff_dst_desc)),
 VERBOSE_NULL_ARG);
- VCHECK_ELTWISE(IMPLICATION(alg_kind == eltwise_round, is_fwd),
+ VCHECK_ELTWISE(IMPLICATION(one_of(alg_kind, eltwise_round, eltwise_hsigmoid,
+ eltwise_round_half_away_from_zero, eltwise_round_half_to_even), is_fwd),
 VERBOSE_BAD_PROPKIND);
 VCHECK_ELTWISE(
 IMPLICATION(is_fwd, !memory_desc_wrapper(src_desc).format_any()),
@@ -136,6 +137,9 @@ status_t eltwise_attr_check(const eltwise_desc_t &desc, const engine_t *engine,
 using namespace primitive_kind;
 VCHECK_ELTWISE_IMPL(po.has_default_values({binary}),
 VERBOSE_UNSUPPORTED_POSTOP);
+
+ // Note: verbose support is inside the call.
+ CHECK(po.validate_binary_with_dst_consistency(&desc.dst_desc)); } } else { VCHECK_ELTWISE_IMPL(false, VERBOSE_UNSUPPORTED_ATTR); diff --git a/src/common/eltwise_pd.hpp b/src/common/eltwise_pd.hpp index e315f5866c8..4f2d43e33da 100644 --- a/src/common/eltwise_pd.hpp +++ b/src/common/eltwise_pd.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2016-2024 Intel Corporation +* Copyright 2016-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -102,10 +102,10 @@ struct eltwise_pd_t : public primitive_desc_t { memory_desc_t src_md_; memory_desc_t dst_md_; - eltwise_pd_t(const eltwise_desc_t *adesc, const primitive_attr_t *attr, + eltwise_pd_t(const op_desc_t *adesc, const primitive_attr_t *attr, const eltwise_fwd_pd_t *hint_fwd_pd) : primitive_desc_t(attr, base_pkind) - , desc_(*adesc) + , desc_(*op_desc_t::to_desc(adesc)) , hint_fwd_pd_(hint_fwd_pd) , src_md_(desc_.src_desc) , dst_md_(desc_.dst_desc) {} @@ -116,9 +116,10 @@ struct eltwise_pd_t : public primitive_desc_t { } }; +// NOLINTBEGIN(google-default-arguments) struct eltwise_fwd_pd_t : public eltwise_pd_t { - typedef eltwise_fwd_pd_t base_class; - typedef eltwise_fwd_pd_t hint_class; + using base_class = eltwise_fwd_pd_t; + using hint_class = eltwise_fwd_pd_t; arg_usage_t arg_usage(int arg) const override { if (arg == DNNL_ARG_SRC) return arg_usage_t::input; @@ -158,7 +159,7 @@ struct eltwise_fwd_pd_t : public eltwise_pd_t { return one_of(alg, eltwise_relu, eltwise_tanh, eltwise_elu, eltwise_square, eltwise_abs, eltwise_sqrt, eltwise_swish, eltwise_gelu_tanh, eltwise_gelu_erf, eltwise_round, - eltwise_hardswish) + eltwise_hardswish, eltwise_round_half_away_from_zero, eltwise_round_half_to_even) || one_of(alg, eltwise_relu_use_dst_for_bwd, eltwise_tanh_use_dst_for_bwd, eltwise_elu_use_dst_for_bwd, @@ -179,7 +180,7 @@ struct eltwise_fwd_pd_t : public eltwise_pd_t { } protected: - eltwise_fwd_pd_t(const eltwise_desc_t *adesc, const primitive_attr_t *attr, + eltwise_fwd_pd_t(const op_desc_t *adesc, const primitive_attr_t *attr, const eltwise_fwd_pd_t *hint_fwd_pd) : eltwise_pd_t(adesc, attr, hint_fwd_pd) {} @@ -190,14 +191,18 @@ struct eltwise_fwd_pd_t : public eltwise_pd_t { == status::success); } }; +// NOLINTEND(google-default-arguments) +// NOLINTBEGIN(google-default-arguments) struct eltwise_bwd_pd_t : public eltwise_pd_t { - typedef eltwise_bwd_pd_t base_class; - typedef eltwise_fwd_pd_t hint_class; + using base_class = eltwise_bwd_pd_t; + using hint_class = eltwise_fwd_pd_t; arg_usage_t arg_usage(int arg) const override { - if (use_dst() ? arg == DNNL_ARG_DST : arg == DNNL_ARG_SRC) - return arg_usage_t::input; + if (arg == DNNL_ARG_SRC) + return !use_dst() ? arg_usage_t::input : arg_usage_t::unused; + if (arg == DNNL_ARG_DST) + return use_dst() ? 
arg_usage_t::input : arg_usage_t::unused; if (arg == DNNL_ARG_DIFF_DST) return arg_usage_t::input; if (arg == DNNL_ARG_DIFF_SRC) return arg_usage_t::output; @@ -281,7 +286,7 @@ struct eltwise_bwd_pd_t : public eltwise_pd_t { memory_desc_t diff_src_md_; memory_desc_t diff_dst_md_; - eltwise_bwd_pd_t(const eltwise_desc_t *adesc, const primitive_attr_t *attr, + eltwise_bwd_pd_t(const op_desc_t *adesc, const primitive_attr_t *attr, const eltwise_fwd_pd_t *hint_fwd_pd) : eltwise_pd_t(adesc, attr, hint_fwd_pd) , diff_src_md_(desc_.diff_src_desc) @@ -298,6 +303,7 @@ struct eltwise_bwd_pd_t : public eltwise_pd_t { == status::success); } }; +// NOLINTEND(google-default-arguments) } // namespace impl } // namespace dnnl diff --git a/src/common/engine.cpp b/src/common/engine.cpp index 9fdd4e1f073..8768b3e0c1f 100644 --- a/src/common/engine.cpp +++ b/src/common/engine.cpp @@ -88,7 +88,19 @@ status_t dnnl_engine_create( try { auto ef = get_engine_factory(kind, get_default_runtime(kind)); VERROR_ENGINE(ef != nullptr, invalid_arguments, - VERBOSE_INVALID_ENGINE_KIND, dnnl_engine_kind2str(kind)); + VERBOSE_INVALID_ENGINE_KIND, "", dnnl_engine_kind2str(kind)); + + auto s_runtime_kind = dnnl_runtime2str(kind == engine_kind::cpu + ? dnnl_version()->cpu_runtime + : dnnl_version()->gpu_runtime); + + VERROR_ENGINE(ef->count() > 0, invalid_arguments, + "%s %s devices queried but not found", + get_default_runtime(kind) == runtime_kind::none + ? "" + : s_runtime_kind, + dnnl_engine_kind2str(kind)); + VERROR_ENGINE(index < ef->count(), invalid_arguments, VERBOSE_INVALID_ENGINE_IDX, ef->count(), dnnl_engine_kind2str(kind), index); diff --git a/src/common/engine.hpp b/src/common/engine.hpp index 5195b4aad28..159358fca66 100644 --- a/src/common/engine.hpp +++ b/src/common/engine.hpp @@ -67,8 +67,13 @@ struct dnnl_engine : public dnnl::impl::c_compatible { /** create memory storage */ virtual dnnl::impl::status_t create_memory_storage( dnnl::impl::memory_storage_t **storage, unsigned flags, size_t size, - void *handle) - = 0; + void *handle) { + assert(impl()); + if (!impl()) return dnnl::impl::status::runtime_error; + return impl()->create_memory_storage( + storage, this, flags, size, handle); + } + dnnl::impl::status_t create_memory_storage( dnnl::impl::memory_storage_t **storage, size_t size) { return create_memory_storage( @@ -187,6 +192,8 @@ inline runtime_kind_t get_default_runtime(engine_kind_t kind) { return runtime_kind::omp; #elif DNNL_CPU_RUNTIME == DNNL_RUNTIME_TBB return runtime_kind::tbb; +#elif DNNL_CPU_RUNTIME == DNNL_RUNTIME_TBB_AUTO + return runtime_kind::tbb_auto; #elif DNNL_CPU_RUNTIME == DNNL_RUNTIME_THREADPOOL return runtime_kind::threadpool; #elif DNNL_CPU_RUNTIME == DNNL_RUNTIME_SYCL diff --git a/src/common/engine_impl.hpp b/src/common/engine_impl.hpp index d84cd6b9763..e42b8fa27fa 100644 --- a/src/common/engine_impl.hpp +++ b/src/common/engine_impl.hpp @@ -62,6 +62,12 @@ class engine_impl_t { virtual status_t init() { return status::success; } + virtual status_t create_memory_storage(memory_storage_t **storage, + engine_t *engine, unsigned flags, size_t size, void *handle) const { + assert(!"unexpected"); + return status::runtime_error; + } + virtual status_t create_stream_impl( impl::stream_impl_t **stream_impl, unsigned flags) const { auto *si = new impl::stream_impl_t(flags); diff --git a/src/common/experimental.cpp b/src/common/experimental.cpp index 6ec2f545a34..e32e9b433da 100644 --- a/src/common/experimental.cpp +++ b/src/common/experimental.cpp @@ -1,5 +1,5 @@ 
/******************************************************************************* -* Copyright 2022 Intel Corporation +* Copyright 2022-2024 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -33,6 +33,16 @@ bool DNNL_API use_bnorm_stats_one_pass() { return stats_onepass_algo; } +bool use_gpu_conv_v2() { +#ifdef DNNL_EXPERIMENTAL + static const bool is_enabled + = getenv_int_user("EXPERIMENTAL_GPU_CONV_V2", 0); +#else + static const bool is_enabled = false; +#endif + return is_enabled; +} + } // namespace experimental } // namespace impl } // namespace dnnl diff --git a/src/common/experimental.hpp b/src/common/experimental.hpp index c6efb96fd9d..7cdfcbd7d83 100644 --- a/src/common/experimental.hpp +++ b/src/common/experimental.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2022 Intel Corporation +* Copyright 2022-2024 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,9 +21,10 @@ namespace impl { namespace experimental { bool use_bnorm_stats_one_pass(); +bool use_gpu_conv_v2(); } // namespace experimental } // namespace impl } // namespace dnnl -#endif \ No newline at end of file +#endif diff --git a/src/common/float16.hpp b/src/common/float16.hpp index 5449967d5d6..1125fd11e43 100644 --- a/src/common/float16.hpp +++ b/src/common/float16.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2024 Intel Corporation +* Copyright 2019-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -24,6 +24,7 @@ #include "bit_cast.hpp" #include "oneapi/dnnl/dnnl.h" +#include "cpu/platform.hpp" namespace dnnl { namespace impl { @@ -39,7 +40,7 @@ struct float16_t { float16_t &operator=(float f); operator float() const; - float f() { return (float)(*this); } + float f() const { return (float)(*this); } float16_t &operator+=(float16_t a) { (*this) = float(f() + a.f()); diff --git a/src/common/float4.cpp b/src/common/float4.cpp new file mode 100644 index 00000000000..34eaae4b4c5 --- /dev/null +++ b/src/common/float4.cpp @@ -0,0 +1,162 @@ +/******************************************************************************* +* Copyright 2024-2025 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/
+
+#include <cmath>
+
+#include "common/bit_cast.hpp"
+#include "common/dnnl_thread.hpp"
+#include "common/float16.hpp"
+#include "common/float4.hpp"
+#include "common/utils.hpp"
+
+namespace dnnl {
+namespace impl {
+
+uint8_t float2e2m1(float f) {
+    uint32_t f_raw = float2int(f);
+    uint32_t sign = f_raw & 0x80000000;
+
+    // There is no NaN or infinity in e2m1, for now we just return zero
+    // TODO: figure if there is a standard value to return
+    uint32_t naninf_mask = 0x7f800000;
+    if ((f_raw & naninf_mask) == naninf_mask) return 0x07 | (sign >> 28);
+
+    // we convert with naive closest value computation out of 8
+    float e2m1_val_table[8] = {0.0f, .5f, 1.0f, 1.5f, 2.0f, 3.0f, 4.0f, 6.0f};
+
+    float abs_f = fmin(e2m1_val_table[7], int2float(f_raw ^ sign));
+
+    int idx = 0;
+    float min_diff = ::fabsf(e2m1_val_table[idx] - abs_f);
+    uint8_t raw_bits = idx;
+    for (++idx; idx < 8; ++idx) {
+        float diff = ::fabsf(e2m1_val_table[idx] - abs_f);
+        if (diff < min_diff) {
+            min_diff = diff;
+            raw_bits = idx;
+        }
+        // Special case for midpoint, we round to even (so even index)
+        if ((diff == min_diff) && !(idx & 1)) raw_bits = idx;
+    }
+    assert(raw_bits < 8);
+    // reapply sign
+    if (sign) raw_bits = raw_bits | 0x08;
+    assert(raw_bits < 16);
+    return raw_bits;
+}
+
+float4_e2m1_t &float4_e2m1_t::operator=(bfloat16_t f) {
+    float f32 = f;
+    raw_bits_ = float2e2m1(f32);
+    return *this;
+}
+
+float4_e2m1_t &float4_e2m1_t::operator=(float16_t f) {
+    float f32 = f;
+    raw_bits_ = float2e2m1(f32);
+    return *this;
+}
+
+float4_e2m1_t &float4_e2m1_t::operator=(float f) {
+    raw_bits_ = float2e2m1(f);
+    return *this;
+}
+
+float4_e2m1_t::operator float() const {
+    // List of e2m1 values. The index of each value maps to its encoding.
+    static const float e2m1_table[16] = {0.0f, .5f, 1.0f, 1.5f, 2.0f, 3.0f,
+            4.0f, 6.0f, -0.0f, -.5f, -1.0f, -1.5f, -2.0f, -3.0f, -4.0f, -6.0f};
+    assert(raw_bits_ < 16);
+    return e2m1_table[raw_bits_];
+}
+
+float4_e2m1_t::operator float16_t() const {
+    // List of e2m1 values. The index of each value maps to its encoding.
+ static const float16_t e2m1_table[16] = {0.0f, .5f, 1.0f, 1.5f, 2.0f, 3.0f, + 4.0f, 6.0f, -0.0f, -.5f, -1.0f, -1.5f, -2.0f, -3.0f, -4.0f, -6.0f}; + assert(raw_bits_ < 16); + return e2m1_table[raw_bits_]; +} + +uint8_t float2e3m0(float f) { + uint32_t f_raw = float2int(f); + uint32_t sign = f_raw & 0x80000000; + + // There is no NaN or infinity in e3m0, we just return maxval + uint32_t naninf_mask = 0x7f800000; + if ((f_raw & naninf_mask) == naninf_mask) return 0x7 | (sign >> 28); + + // we convert with naive closest value computation out of 8 + float e3m0_val_table[8] = {0.0f, .25f, .5f, 1.0f, 2.0f, 4.0f, 8.0f, 16.0f}; + + float abs_f = fmin(e3m0_val_table[7], int2float(f_raw ^ sign)); + + int idx = 0; + float min_diff = ::fabsf(e3m0_val_table[idx] - abs_f); + uint8_t raw_bits = idx; + for (++idx; idx < 8; ++idx) { + float diff = ::fabsf(e3m0_val_table[idx] - abs_f); + if (diff < min_diff) { + min_diff = diff; + raw_bits = idx; + } + // Special case for midpoint, we round to even (so even index) + if ((diff == min_diff) && !(idx & 1)) raw_bits = idx; + } + assert(raw_bits < 8); + // reapply sign + if (sign) raw_bits = raw_bits | 0x08; + assert(raw_bits < 16); + return raw_bits; +} + +float4_e3m0_t &float4_e3m0_t::operator=(bfloat16_t f) { + float f32 = f; + raw_bits_ = float2e3m0(f32); + return *this; +} + +float4_e3m0_t &float4_e3m0_t::operator=(float16_t f) { + float f32 = f; + raw_bits_ = float2e3m0(f32); + return *this; +} + +float4_e3m0_t &float4_e3m0_t::operator=(float f) { + raw_bits_ = float2e3m0(f); + return *this; +} + +float4_e3m0_t::operator float() const { + // List of e3m0 values. The index of each value maps to its encoding. + static const float e3m0_table[16] + = {0.0f, .25f, .5f, 1.0f, 2.0f, 4.0f, 8.0f, 16.0f, -0.0f, -.25f, + -.5f, -1.0f, -2.0f, -4.0f, -8.0f, -16.0f}; + assert(raw_bits_ < 16); + return e3m0_table[raw_bits_]; +} + +float4_e3m0_t::operator float16_t() const { + // List of e3m0 values. The index of each value maps to its encoding. + static const float16_t e3m0_table[16] + = {0.0f, .25f, .5f, 1.0f, 2.0f, 4.0f, 8.0f, 16.0f, -0.0f, -.25f, + -.5f, -1.0f, -2.0f, -4.0f, -8.0f, -16.0f}; + assert(raw_bits_ < 16); + return e3m0_table[raw_bits_]; +} + +} // namespace impl +} // namespace dnnl diff --git a/src/common/float4.hpp b/src/common/float4.hpp new file mode 100644 index 00000000000..44be31d9d0a --- /dev/null +++ b/src/common/float4.hpp @@ -0,0 +1,78 @@ +/******************************************************************************* +* Copyright 2024 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +#ifndef COMMON_FLOAT4_HPP +#define COMMON_FLOAT4_HPP + +#include +#include + +#include "common/bfloat16.hpp" +#include "common/float16.hpp" + +namespace dnnl { +namespace impl { + +struct float4_e2m1_t { + uint8_t raw_bits_; + float4_e2m1_t() = default; + constexpr float4_e2m1_t(uint8_t r, bool = true) : raw_bits_(r) {} + float4_e2m1_t(float f) { (*this) = f; } + float4_e2m1_t(float16_t f) { (*this) = f; } + float4_e2m1_t(bfloat16_t f) { (*this) = f; } + + float4_e2m1_t DNNL_API &operator=(float f); + float4_e2m1_t DNNL_API &operator=(float16_t f); + float4_e2m1_t DNNL_API &operator=(bfloat16_t f); + + DNNL_API operator float() const; + DNNL_API operator float16_t() const; + DNNL_API operator bfloat16_t() const; + + float4_e2m1_t &operator+=(const float a) { + (*this) = float {*this} + a; + return *this; + } +}; +static_assert(sizeof(float4_e2m1_t) == 1, "float4_e2m1_t must be 1 byte"); + +struct float4_e3m0_t { + uint8_t raw_bits_; + float4_e3m0_t() = default; + constexpr float4_e3m0_t(uint8_t r, bool = true) : raw_bits_(r) {} + float4_e3m0_t(float f) { (*this) = f; } + float4_e3m0_t(float16_t f) { (*this) = f; } + float4_e3m0_t(bfloat16_t f) { (*this) = f; } + + float4_e3m0_t DNNL_API &operator=(float f); + float4_e3m0_t DNNL_API &operator=(float16_t f); + float4_e3m0_t DNNL_API &operator=(bfloat16_t f); + + DNNL_API operator float() const; + DNNL_API operator float16_t() const; + DNNL_API operator bfloat16_t() const; + + float4_e3m0_t &operator+=(const float a) { + (*this) = float {*this} + a; + return *this; + } +}; +static_assert(sizeof(float4_e3m0_t) == 1, "float4_e3m0_t must be 1 byte"); + +} // namespace impl +} // namespace dnnl + +#endif diff --git a/src/common/float8.hpp b/src/common/float8.hpp index a32e9bbca7d..e8f09cb0467 100644 --- a/src/common/float8.hpp +++ b/src/common/float8.hpp @@ -72,6 +72,7 @@ struct float8_e4m3_t { return *this; } }; +static_assert(sizeof(float8_e4m3_t) == 1, "float8_e4m3_t must be 1 byte"); void cvt_f8_e5m2_to_float(float *out, const float8_e5m2_t *inp, size_t nelems); void cvt_f8_e4m3_to_float(float *out, const float8_e4m3_t *inp, size_t nelems); @@ -85,8 +86,6 @@ void add_floats_and_cvt_to_f8_e5m2(float8_e5m2_t *out, const float *inp0, void add_floats_and_cvt_to_f8_e4m3(float8_e4m3_t *out, const float *inp0, const float *inp1, size_t nelems); -static_assert(sizeof(float8_e5m2_t) == 1, "float8_e4m3_t must be 1 byte"); - #if DNNL_X64 namespace cpu { namespace x64 { diff --git a/src/common/gemm.cpp b/src/common/gemm.cpp index 6a2578cd5ca..52da4a1be84 100644 --- a/src/common/gemm.cpp +++ b/src/common/gemm.cpp @@ -85,7 +85,7 @@ std::string get_descriptor(dim_t M, dim_t N, dim_t K) { if (!is_src_ab && lda != M) ss << "lda:" << lda << " "; \ if (is_wei_ab && ldb != N) ss << "ldb:" << ldb << " "; \ if (!is_wei_ab && ldb != K) ss << "ldb:" << ldb << " "; \ - if (alpha != 1.f) ss << "attr-oscale:common:" << alpha << " "; \ + if (alpha != 1.f) ss << "attr-scales:src:common:" << alpha << " "; \ if (beta != 0.f) ss << "attr-post-ops:sum:" << beta << " "; \ ss << ",," << get_descriptor(M, N, K); \ VPROF(start_ms, primitive, exec, VERBOSE_profile, ss.str().c_str(), \ diff --git a/src/common/gemm_pd.hpp b/src/common/gemm_pd.hpp index d7c4f2c650e..2180aada880 100644 --- a/src/common/gemm_pd.hpp +++ b/src/common/gemm_pd.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2024 Intel Corporation +* Copyright 2019-2025 
Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -36,11 +36,12 @@ namespace impl { VCHECK(primitive, create, dispatch, gemm, (f), "%s," msg, \ this->info(engine), ##__VA_ARGS__) +// NOLINTBEGIN(google-default-arguments) struct gemm_pd_t : public primitive_desc_t { static constexpr auto base_pkind = primitive_kind::gemm; - typedef gemm_pd_t base_class; - typedef gemm_pd_t hint_class; + using base_class = gemm_pd_t; + using hint_class = gemm_pd_t; const gemm_desc_t *desc() const { return &desc_; } const op_desc_t *op_desc() const override { @@ -91,9 +92,10 @@ struct gemm_pd_t : public primitive_desc_t { // resolve the 'any' tags. gemm_desc_t desc_; - gemm_pd_t(const gemm_desc_t *adesc, const primitive_attr_t *attr, + gemm_pd_t(const op_desc_t *adesc, const primitive_attr_t *attr, const hint_class *hint_fwd_pd) - : primitive_desc_t(attr, base_pkind), desc_(*adesc) {} + : primitive_desc_t(attr, base_pkind) + , desc_(*op_desc_t::to_desc(adesc)) {} // By default, we just resolve 'any' with blocked layout and trivial strides bool set_default_format(memory_desc_t *md) { @@ -121,6 +123,7 @@ struct gemm_pd_t : public primitive_desc_t { return ok; } }; +// NOLINTEND(google-default-arguments) } // namespace impl } // namespace dnnl diff --git a/src/common/gemm_types.hpp b/src/common/gemm_types.hpp index c23a7c9f58d..7dac69356e3 100644 --- a/src/common/gemm_types.hpp +++ b/src/common/gemm_types.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2024 Intel Corporation +* Copyright 2019-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,6 +20,7 @@ #include #include "common/c_types_map.hpp" #include "common/memory_desc.hpp" +#include "common/opdesc.hpp" namespace dnnl { namespace impl { @@ -49,20 +50,23 @@ const sum_ab_t sum_none = dnnl_sum_none; // A descriptor for a matrix multiplication (gemm) operation. To make the // interface consistent, the descriptor represent the GEMM operation in row // major. -struct gemm_desc_t { - // The kind of primitive. Used for self identifying the primitive - // descriptor. Must be #dnnl_gemm. - dnnl_primitive_kind_t primitive_kind; +struct gemm_desc_t : public op_desc_t { + gemm_desc_t() : op_desc_t(primitive_kind::gemm) {} + + std::unique_ptr clone() const override { + return utils::make_unique(*this); + } + memory_desc_t a_desc; memory_desc_t b_desc; memory_desc_t c_desc; memory_desc_t bias_desc; // Type for accumulating A*B. - dnnl_data_type_t acc_type; + dnnl_data_type_t acc_type {}; // Sum across k dimension in either A or B tensor // and output to sum_ab tensor. - sum_ab_t sum_ab; - dnnl_data_type_t sum_ab_type; + sum_ab_t sum_ab {}; + dnnl_data_type_t sum_ab_type {}; // These accessors are to be used by the GEMM implementation. Because the // GEMM implementation currently assumes column major. These accessors @@ -73,7 +77,8 @@ struct gemm_desc_t { // Simplified accessors that comply to GEMM API static transpose_t get_trans(const memory_desc_t &md) { if (!md.ndims) return transpose::notrans; // arbitrary - return md.format_desc.blocking.strides[md.ndims - 1] != 1 + return md.dims[md.ndims - 1] != 1 + && md.format_desc.blocking.strides[md.ndims - 1] != 1 ? 
transpose::trans : transpose::notrans; } @@ -116,9 +121,16 @@ struct gemm_desc_t { // This assumes that one of the dimensions has strides 1 static dnnl_dim_t get_ld(const memory_desc_t &md) { auto strides = md.format_desc.blocking.strides; - assert(strides[md.ndims - 1] == 1 || strides[md.ndims - 2] == 1); - return strides[md.ndims - 1] != 1 ? strides[md.ndims - 1] - : strides[md.ndims - 2]; + assert(md.dims[md.ndims - 1] == 1 || strides[md.ndims - 1] == 1 + || md.dims[md.ndims - 2] == 1 || strides[md.ndims - 2] == 1); + switch (get_trans(md)) { + case transpose::trans: + return md.dims[md.ndims - 1] > 1 ? strides[md.ndims - 1] + : md.dims[md.ndims - 2]; + default: + return md.dims[md.ndims - 2] > 1 ? strides[md.ndims - 2] + : md.dims[md.ndims - 1]; + } } // Leading dimension of A. dnnl_dim_t lda() const { return get_ld(b_desc); } diff --git a/src/common/gemm_utils.hpp b/src/common/gemm_utils.hpp index 65045a7d911..23afbb2c2a0 100644 --- a/src/common/gemm_utils.hpp +++ b/src/common/gemm_utils.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2023 Intel Corporation +* Copyright 2019-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -101,9 +101,10 @@ static inline status_t create_2d_desc(memory_desc_t *md_2d, int d0, int d1, } } -static inline gemm_desc_t create_gemm_desc(const memory_desc_t *a_md, - const memory_desc_t *b_md, const memory_desc_t *c_md, - const memory_desc_t *bias_md, data_type_t acc_dt, engine_t *engine, +static inline status_t create_gemm_desc(gemm_desc_t *_gemm_desc, + const memory_desc_t *a_md, const memory_desc_t *b_md, + const memory_desc_t *c_md, const memory_desc_t *bias_md, + data_type_t acc_dt, engine_t *engine, sum_ab_t sum_ab = sum_ab::sum_none, data_type_t sum_ab_dt = data_type::undef) { auto gemm_desc = gemm_desc_t(); @@ -121,7 +122,8 @@ static inline gemm_desc_t create_gemm_desc(const memory_desc_t *a_md, data_type::f16, a_md->data_type, b_md->data_type)) { gemm_desc.acc_type = data_type::f16; } - return gemm_desc; + *_gemm_desc = gemm_desc; + return status::success; } static inline status_t create_gemm_pd( @@ -131,8 +133,9 @@ static inline status_t create_gemm_pd( data_type_t acc_dt, const primitive_attr_t *attr, bool skip_ref = false, sum_ab_t sum_ab = sum_ab::sum_none, data_type_t sum_ab_dt = data_type::undef) { - auto gemm_desc = create_gemm_desc( - a_md, b_md, c_md, bias_md, acc_dt, engine, sum_ab, sum_ab_dt); + gemm_desc_t gemm_desc; + CHECK(create_gemm_desc(&gemm_desc, a_md, b_md, c_md, bias_md, acc_dt, + engine, sum_ab, sum_ab_dt)); primitive_attr_t gemm_attr = *attr; @@ -141,7 +144,7 @@ static inline status_t create_gemm_pd( gemm_pd_ = *(++it); if (!gemm_pd_) return status::unimplemented; - if (skip_ref && strstr(gemm_pd_.get()->name(), "ref") != NULL) + if (skip_ref && strstr(gemm_pd_->name(), "ref") != nullptr) return status::unimplemented; return status::success; @@ -156,8 +159,11 @@ static inline bool is_md_gemm_compatible_plain_format( if (blk_desc.inner_nblks != 0) return false; - return (blk_desc.strides[md->ndims - 1] == 1) - || (!is_dst && blk_desc.strides[md->ndims - 2] == 1); + return (md->dims[md->ndims - 1] == 1 + || blk_desc.strides[md->ndims - 1] == 1) + || (!is_dst + && (md->dims[md->ndims - 2] == 1 + || blk_desc.strides[md->ndims - 2] == 1)); } } // namespace impl diff --git a/src/common/group_normalization.cpp b/src/common/group_normalization.cpp index 
4e0abf3b6a2..ddc44734119 100644
--- a/src/common/group_normalization.cpp
+++ b/src/common/group_normalization.cpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2023 Intel Corporation
+* Copyright 2023-2025 Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -147,19 +147,25 @@ status_t group_normalization_attr_check(const group_normalization_desc_t &desc,
         const bool is_int8 = utils::one_of(src_dt, data_type::s8, data_type::u8)
                 || utils::one_of(dst_dt, data_type::s8, data_type::u8);
-        if (is_int8) fwd_attr_mask |= smask_t::scales_runtime;
+        if (is_int8) fwd_attr_mask |= smask_t::scales;
 
         VCHECK_GNORM_UNIMPL(attr->has_default_values(fwd_attr_mask, dst_dt),
                 VERBOSE_UNSUPPORTED_ATTR);
 
         // Check scales
         if (!attr->scales_.has_default_values()) {
-            const auto &sc = attr->scales_;
-            const int mask_src = sc.get(DNNL_ARG_SRC).mask_;
-            const int mask_dst = sc.get(DNNL_ARG_DST).mask_;
-
-            VCHECK_GNORM_UNIMPL(utils::everyone_is(0, mask_src, mask_dst),
+            static const std::vector<int> supported_args {
+                    DNNL_ARG_SRC, DNNL_ARG_DST};
+            VCHECK_GNORM_UNIMPL(
+                    attr->scales_.has_default_values(supported_args),
                     VERBOSE_UNSUPPORTED_SCALES_CFG);
+
+            for (int arg : supported_args) {
+                if (attr->scales_.has_default_values(arg)) continue;
+
+                const int mask = attr->scales_.get_mask(arg);
+                VCHECK_GNORM_UNIMPL(mask == 0, VERBOSE_UNSUPPORTED_SCALES_CFG);
+            }
         }
 
         // Check post-ops
@@ -168,6 +174,9 @@ status_t group_normalization_attr_check(const group_normalization_desc_t &desc,
             using namespace primitive_kind;
             VCHECK_GNORM_UNIMPL(po.has_default_values({binary, eltwise}),
                     VERBOSE_UNSUPPORTED_POSTOP);
+
+            // Note: verbose support is inside the call.
+            CHECK(po.validate_binary_with_dst_consistency(&desc.dst_desc));
         }
     } else {
         VCHECK_GNORM_UNIMPL(false, VERBOSE_UNSUPPORTED_ATTR);
diff --git a/src/common/group_normalization_pd.hpp b/src/common/group_normalization_pd.hpp
index 313f36f378d..7b908050efc 100644
--- a/src/common/group_normalization_pd.hpp
+++ b/src/common/group_normalization_pd.hpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2023-2024 Intel Corporation
+* Copyright 2023-2025 Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
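For context on the mask checks added above: a scales mask of 0 requests a single scale value common to the whole tensor, which is the only configuration group_normalization_attr_check() accepts for src and dst. A short sketch of how a caller would attach such scales through the public C++ API (engine, memory descriptors, and the int8 src/dst setup that actually triggers this path are elided):

#include "oneapi/dnnl/dnnl.hpp"

dnnl::primitive_attr make_gnorm_int8_attr() {
    dnnl::primitive_attr attr;
    // mask == 0: one common scale per argument; any per-channel mask would
    // be rejected with VERBOSE_UNSUPPORTED_SCALES_CFG by the check above.
    attr.set_scales_mask(DNNL_ARG_SRC, 0);
    attr.set_scales_mask(DNNL_ARG_DST, 0);
    return attr;
}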
@@ -100,20 +100,21 @@ struct group_normalization_pd_t : public primitive_desc_t { memory_desc_t stat_md_; memory_desc_t scaleshift_md_; - group_normalization_pd_t(const group_normalization_desc_t *adesc, + group_normalization_pd_t(const op_desc_t *adesc, const primitive_attr_t *attr, const group_normalization_fwd_pd_t *hint_fwd_pd) : primitive_desc_t(attr, base_pkind) - , desc_(*adesc) + , desc_(*op_desc_t::to_desc(adesc)) , hint_fwd_pd_(hint_fwd_pd) , src_md_(desc_.src_desc) , stat_md_(desc_.stat_desc) , scaleshift_md_(desc_.scaleshift_desc) {} }; +// NOLINTBEGIN(google-default-arguments) struct group_normalization_fwd_pd_t : public group_normalization_pd_t { - typedef group_normalization_fwd_pd_t base_class; - typedef group_normalization_fwd_pd_t hint_class; + using base_class = group_normalization_fwd_pd_t; + using hint_class = group_normalization_fwd_pd_t; arg_usage_t arg_usage(int arg) const override { if (arg == DNNL_ARG_SRC) return arg_usage_t::input; @@ -123,8 +124,10 @@ struct group_normalization_fwd_pd_t : public group_normalization_pd_t { return arg_usage_t::unused; } - if (arg == DNNL_ARG_SCALE && use_scale()) return arg_usage_t::input; - if (arg == DNNL_ARG_SHIFT && use_shift()) return arg_usage_t::input; + if (arg == DNNL_ARG_SCALE) + return use_scale() ? arg_usage_t::input : arg_usage_t::unused; + if (arg == DNNL_ARG_SHIFT) + return use_shift() ? arg_usage_t::input : arg_usage_t::unused; if (arg == DNNL_ARG_DST) return arg_usage_t::output; return primitive_desc_t::arg_usage(arg); @@ -175,7 +178,7 @@ struct group_normalization_fwd_pd_t : public group_normalization_pd_t { protected: memory_desc_t dst_md_; - group_normalization_fwd_pd_t(const group_normalization_desc_t *adesc, + group_normalization_fwd_pd_t(const op_desc_t *adesc, const primitive_attr_t *attr, const hint_class *hint_fwd_pd) : group_normalization_pd_t(adesc, attr, hint_fwd_pd) , dst_md_(desc_.dst_desc) {} @@ -190,43 +193,44 @@ struct group_normalization_fwd_pd_t : public group_normalization_pd_t { return IMPLICATION(use_scale() || use_shift(), weights_md()->data_type == data_type::f32); } - bool attr_scales_ok() const { + bool attr_scales_ok(const std::vector &supported_args + = {DNNL_ARG_SRC, DNNL_ARG_DST}) const { using namespace data_type; const auto &scales = attr()->scales_; - const std::vector supported_args({DNNL_ARG_SRC, DNNL_ARG_DST}); bool ok = scales.has_default_values(supported_args); for (const auto &arg : supported_args) { - const auto &sc = scales.get(arg); - if (!sc.has_default_values()) { + if (!scales.has_default_values(arg)) { const data_type_t dt = arg_md(arg)->data_type; - ok = ok && utils::one_of(dt, s8, u8) && sc.mask_ == 0; + ok = ok && utils::one_of(dt, s8, u8); + ok = ok && scales.get_mask(arg) == 0; } } return ok; } }; +// NOLINTEND(google-default-arguments) +// NOLINTBEGIN(google-default-arguments) struct group_normalization_bwd_pd_t : public group_normalization_pd_t { - typedef group_normalization_bwd_pd_t base_class; - typedef group_normalization_fwd_pd_t hint_class; + using base_class = group_normalization_bwd_pd_t; + using hint_class = group_normalization_fwd_pd_t; arg_usage_t arg_usage(int arg) const override { if (utils::one_of(arg, DNNL_ARG_SRC, DNNL_ARG_MEAN, DNNL_ARG_VARIANCE, DNNL_ARG_DIFF_DST)) return arg_usage_t::input; - if (arg == DNNL_ARG_SCALE && use_scale()) return arg_usage_t::input; - - if (arg == DNNL_ARG_WORKSPACE && !types::is_zero_md(workspace_md())) - return arg_usage_t::input; + if (arg == DNNL_ARG_SCALE) + return use_scale() ? 
arg_usage_t::input : arg_usage_t::unused; if (arg == DNNL_ARG_DIFF_SRC) return arg_usage_t::output; - if (arg == DNNL_ARG_DIFF_SCALE && use_scale()) - return arg_usage_t::output; - if (arg == DNNL_ARG_DIFF_SHIFT && use_shift()) - return arg_usage_t::output; + if (arg == DNNL_ARG_DIFF_SCALE) + return use_scale() ? arg_usage_t::output : arg_usage_t::unused; + if (arg == DNNL_ARG_DIFF_SHIFT) + return use_shift() ? arg_usage_t::output : arg_usage_t::unused; + return primitive_desc_t::arg_usage(arg); } @@ -285,7 +289,7 @@ struct group_normalization_bwd_pd_t : public group_normalization_pd_t { memory_desc_t diff_dst_md_; memory_desc_t diff_scaleshift_md_; - group_normalization_bwd_pd_t(const group_normalization_desc_t *adesc, + group_normalization_bwd_pd_t(const op_desc_t *adesc, const primitive_attr_t *attr, const hint_class *hint_fwd_pd) : group_normalization_pd_t(adesc, attr, hint_fwd_pd) , diff_src_md_(desc_.diff_src_desc) @@ -309,6 +313,7 @@ struct group_normalization_bwd_pd_t : public group_normalization_pd_t { diff_weights_md()->data_type)); } }; +// NOLINTEND(google-default-arguments) } // namespace impl } // namespace dnnl diff --git a/src/common/impl_list_item.hpp b/src/common/impl_list_item.hpp index 45a8d0aee1f..a18f4d49a9f 100644 --- a/src/common/impl_list_item.hpp +++ b/src/common/impl_list_item.hpp @@ -19,6 +19,7 @@ #include "c_types_map.hpp" #include "primitive_desc.hpp" +#include "dnnl_sel_build.hpp" #include "utils.hpp" namespace dnnl { @@ -89,24 +90,35 @@ struct impl_list_item_t { : public type_deduction_helper_t {}; template - constexpr impl_list_item_t(type_deduction_helper_t) - : create_pd_func_(&primitive_desc_t::create< - typename type_deduction_helper_t::type>) {} + impl_list_item_t(type_deduction_helper_t) { + using deduced_pd_t = typename type_deduction_helper_t::type; + create_pd_func_ = &primitive_desc_t::create; + DNNL_PRIMITIVE_NAME_INIT(pd_t); + } template - constexpr impl_list_item_t(concat_type_deduction_helper_t) - : create_concat_pd_func_( - concat_type_deduction_helper_t::type::create) {} + impl_list_item_t(concat_type_deduction_helper_t) { + using deduced_pd_t = + typename concat_type_deduction_helper_t::type; + create_concat_pd_func_ = deduced_pd_t::create; + DNNL_PRIMITIVE_NAME_INIT(pd_t); + } template - constexpr impl_list_item_t(sum_type_deduction_helper_t) - : create_sum_pd_func_(sum_type_deduction_helper_t::type::create) { + impl_list_item_t(sum_type_deduction_helper_t) { + using deduced_pd_t = typename sum_type_deduction_helper_t::type; + create_sum_pd_func_ = deduced_pd_t::create; + DNNL_PRIMITIVE_NAME_INIT(pd_t); } template - constexpr impl_list_item_t(reorder_type_deduction_helper_t) - : create_reorder_pd_func_( - reorder_type_deduction_helper_t::type::create) {} + impl_list_item_t(reorder_type_deduction_helper_t) { + using deduced_pd_t = + typename reorder_type_deduction_helper_t::type; + create_reorder_pd_func_ = deduced_pd_t::create; + DNNL_PRIMITIVE_NAME_INIT(pd_t); + } + explicit operator bool() const { return !utils::everyone_is(nullptr, create_pd_func_, @@ -127,6 +139,10 @@ struct impl_list_item_t { return -1; } +#if defined(SELECTIVE_BUILD_ANALYZER) + const char *name = {}; +#endif + private: status_t operator()(primitive_desc_t **pd, const op_desc_t *adesc, const primitive_attr_t *attr, engine_t *engine, @@ -206,6 +222,15 @@ struct impl_list_item_t { engine_t *, const primitive_attr_t *); }; +#if defined(SELECTIVE_BUILD_ANALYZER) +inline impl_list_item_t&& move(impl_list_item_t &&t, const char *name) { + OV_ITT_SCOPED_TASK( + 
dnnl::FACTORY_DNNL, + openvino::itt::handle(std::string("REG$CPUEngine$") + t.name + "$" + name)); + return static_cast(t); +} +#endif + } // namespace impl } // namespace dnnl diff --git a/src/common/impl_registration.hpp b/src/common/impl_registration.hpp index 6625f21653d..d6dc6c36079 100644 --- a/src/common/impl_registration.hpp +++ b/src/common/impl_registration.hpp @@ -27,9 +27,9 @@ #define REG_BWD_D_PK(...) __VA_ARGS__ #else #define REG_BWD_PK(...) \ - { nullptr } + { nullptr }, #define REG_BWD_D_PK(...) \ - { nullptr } + { nullptr }, #endif // Primitives section @@ -56,7 +56,7 @@ #define REG_CONCAT_P(...) __VA_ARGS__ #else #define REG_CONCAT_P(...) \ - { nullptr } + { nullptr }, #endif #if BUILD_PRIMITIVE_ALL || BUILD_CONVOLUTION @@ -128,7 +128,7 @@ #define REG_MATMUL_P(...) __VA_ARGS__ #else #define REG_MATMUL_P(...) \ - { nullptr } + { nullptr }, #endif #if BUILD_PRIMITIVE_ALL || BUILD_POOLING @@ -149,7 +149,7 @@ #define REG_REDUCTION_P(...) __VA_ARGS__ #else #define REG_REDUCTION_P(...) \ - { nullptr } + { nullptr }, #endif #if BUILD_PRIMITIVE_ALL || BUILD_REORDER @@ -245,4 +245,10 @@ #define REG_XE2_ISA(...) #endif +#if BUILD_PRIMITIVE_GPU_ISA_ALL || BUILD_XE3 +#define REG_XE3_ISA(...) __VA_ARGS__ +#else +#define REG_XE3_ISA(...) +#endif + #endif diff --git a/src/common/inner_product.cpp b/src/common/inner_product.cpp index 8375869cec0..05c314a7645 100644 --- a/src/common/inner_product.cpp +++ b/src/common/inner_product.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2016-2024 Intel Corporation +* Copyright 2016-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -109,36 +109,60 @@ status_t ip_attr_check(const inner_product_desc_t &desc, const engine_t *engine, using smask_t = primitive_attr_t::skip_mask_t; if (attr == nullptr) return status::success; - if (attr->has_default_values()) return status::success; + const data_type_t src_dt = desc.src_desc.data_type; + const data_type_t wei_dt = desc.weights_desc.data_type; + bool is_weight_compression = (one_of(src_dt, data_type::f32, data_type::bf16) && + one_of(wei_dt, data_type::u8, data_type::s8, data_type::nf4, data_type::s4, data_type::u4, data_type::f4_e2m1)) || + (one_of(src_dt, data_type::f32) && one_of(wei_dt, data_type::f16, data_type::bf16)); + auto attr_mask = smask_t::none; + // From oneDNN 3.5, those checks must be skipped if wei_decomp is enabled + // reference from src/plugins/intel_cpu/thirdparty/onednn/src/common/matmul.cpp:L62 + if (is_weight_compression) { + attr_mask |= smask_t::zero_points_data_type; + attr_mask |= smask_t::zero_points_groups; + attr_mask |= smask_t::scales_data_type; + attr_mask |= smask_t::scales_groups; + } + if (attr->has_default_values(attr_mask)) return status::success; // Check attributes if (utils::one_of(desc.prop_kind, prop_kind::forward_inference, prop_kind::forward_training)) { const data_type_t src_dt = desc.src_desc.data_type; const data_type_t dst_dt = desc.dst_desc.data_type; + const data_type_t wei_dt = desc.weights_desc.data_type; - auto fwd_attr_mask - = smask_t::post_ops | smask_t::sum_dt | smask_t::fpmath_mode; + auto fwd_attr_mask = smask_t::post_ops | smask_t::sum_dt + | smask_t::fpmath_mode | smask_t::accumulation_mode; bool is_int8 = utils::one_of(src_dt, data_type::s8, data_type::u8); if (engine->kind() == engine_kind::gpu) is_int8 = is_int8 || utils::one_of(dst_dt, data_type::s8, data_type::u8, 
data_type::s32); - if (is_int8) fwd_attr_mask |= smask_t::scales_runtime; + if (engine->kind() == engine_kind::cpu) + is_int8 |= one_of(wei_dt, data_type::u8, data_type::s8, data_type::nf4, data_type::s4, data_type::u4, data_type::f4_e2m1); + if (is_int8) fwd_attr_mask |= smask_t::scales | smask_t::zero_points | smask_t::src_dyn_quant_params; + + if (is_weight_compression) { + fwd_attr_mask |= attr_mask; + } VCHECK_IP_UNIMPL(attr->has_default_values(fwd_attr_mask, dst_dt), VERBOSE_UNSUPPORTED_ATTR); // Check scales if (!attr->scales_.has_default_values()) { const auto &sc = attr->scales_; - const int mask_src = sc.get(DNNL_ARG_SRC).mask_; - const int mask_wei = sc.get(DNNL_ARG_WEIGHTS).mask_; - const int mask_dst = sc.get(DNNL_ARG_DST).mask_; - - VCHECK_IP_UNIMPL(utils::everyone_is(0, mask_src, mask_dst) - && utils::one_of(mask_wei, 0, 1), + VCHECK_IP_UNIMPL(IMPLICATION(!sc.has_default_values(DNNL_ARG_SRC), + sc.get_mask(DNNL_ARG_SRC) == 0), + VERBOSE_UNSUPPORTED_SCALES_CFG); + VCHECK_IP_UNIMPL( + IMPLICATION(!sc.has_default_values(DNNL_ARG_WEIGHTS), + utils::one_of(sc.get_mask(DNNL_ARG_WEIGHTS), 0, 1)), + VERBOSE_UNSUPPORTED_SCALES_CFG); + VCHECK_IP_UNIMPL(IMPLICATION(!sc.has_default_values(DNNL_ARG_DST), + sc.get_mask(DNNL_ARG_DST) == 0), VERBOSE_UNSUPPORTED_SCALES_CFG); } @@ -153,9 +177,12 @@ status_t ip_attr_check(const inner_product_desc_t &desc, const engine_t *engine, // Check sum VCHECK_IP_UNIMPL(po.check_sum_consistency(dst_dt, is_int8, true), VERBOSE_UNSUPPORTED_POSTOP); + + // Note: verbose support is inside the call. + CHECK(po.validate_binary_with_dst_consistency(&desc.dst_desc)); } } else { - auto bwd_attr_mask = smask_t::fpmath_mode; + auto bwd_attr_mask = smask_t::fpmath_mode | smask_t::accumulation_mode; VCHECK_IP_UNIMPL(attr->has_default_values(bwd_attr_mask), VERBOSE_UNSUPPORTED_ATTR); } diff --git a/src/common/inner_product_pd.hpp b/src/common/inner_product_pd.hpp index 6dbdfcafc64..4e71f61aa17 100644 --- a/src/common/inner_product_pd.hpp +++ b/src/common/inner_product_pd.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2016-2024 Intel Corporation +* Copyright 2016-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
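To make the accepted weights masks above concrete: mask == 0 means one scale for the whole weights tensor, while mask == (1 << 0) means one scale per output channel (dimension 0 of the OC x IC weights). A self-contained sketch of per-OC dequantization, independent of the library internals:

#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
    const int OC = 2, IC = 3;
    std::vector<int8_t> wei = {1, 2, 3, 4, 5, 6}; // quantized weights, OC x IC
    std::vector<float> wei_scales = {0.5f, 0.25f}; // mask == (1 << 0): one per OC
    std::vector<float> src = {1.f, 1.f, 1.f};
    for (int oc = 0; oc < OC; ++oc) {
        float acc = 0.f;
        for (int ic = 0; ic < IC; ++ic)
            acc += src[ic] * (float)wei[oc * IC + ic];
        // Dequantize once per output channel; with mask == 0 a single
        // wei_scales[0] would apply to both rows instead.
        printf("dst[%d] = %g\n", oc, acc * wei_scales[oc]);
    }
    return 0;
}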
@@ -44,13 +44,6 @@ struct inner_product_fwd_pd_t; struct inner_product_pd_t : public primitive_desc_t { static constexpr auto base_pkind = primitive_kind::inner_product; - inner_product_pd_t(const inner_product_desc_t *adesc, - const primitive_attr_t *attr, - const inner_product_fwd_pd_t *hint_fwd_pd) - : primitive_desc_t(attr, base_pkind) - , desc_(*adesc) - , hint_fwd_pd_(hint_fwd_pd) {} - const inner_product_desc_t *desc() const { return &desc_; } const op_desc_t *op_desc() const override { return reinterpret_cast(this->desc()); @@ -139,10 +132,16 @@ struct inner_product_pd_t : public primitive_desc_t { inner_product_desc_t desc_; const inner_product_fwd_pd_t *hint_fwd_pd_; + inner_product_pd_t(const op_desc_t *adesc, const primitive_attr_t *attr, + const inner_product_fwd_pd_t *hint_fwd_pd) + : primitive_desc_t(attr, base_pkind) + , desc_(*op_desc_t::to_desc(adesc)) + , hint_fwd_pd_(hint_fwd_pd) {} + bool set_default_formats_common_template(memory_desc_t &src_md, format_tag_t src_tag, memory_desc_t &wei_md, format_tag_t wei_tag, memory_desc_t &dst_md, format_tag_t dst_tag, - memory_desc_t &bia_md) { + memory_desc_t &bia_md) const { using namespace format_tag; #define IS_OK(f) \ @@ -185,7 +184,9 @@ struct inner_product_pd_t : public primitive_desc_t { = {DNNL_ARG_SRC, DNNL_ARG_WEIGHTS, DNNL_ARG_DST}) const { bool ok = attr()->scales_.has_default_values(supported_args); for (auto arg : supported_args) { - int mask = attr()->scales_.get(arg).mask_; + if (attr()->scales_.has_default_values(arg)) continue; + + int mask = attr()->scales_.get_mask(arg); if (arg == DNNL_ARG_WEIGHTS) ok = ok && (mask == 0 || mask == (1 << 0)); else @@ -195,24 +196,17 @@ struct inner_product_pd_t : public primitive_desc_t { } }; +// NOLINTBEGIN(google-default-arguments) struct inner_product_fwd_pd_t : public inner_product_pd_t { - typedef inner_product_fwd_pd_t base_class; - typedef inner_product_fwd_pd_t hint_class; - - inner_product_fwd_pd_t(const inner_product_desc_t *adesc, - const primitive_attr_t *attr, - const inner_product_fwd_pd_t *hint_fwd_pd) - : inner_product_pd_t(adesc, attr, hint_fwd_pd) - , src_md_(desc_.src_desc) - , weights_md_(desc_.weights_desc) - , bias_md_(desc_.bias_desc) - , dst_md_(desc_.dst_desc) {} + using base_class = inner_product_fwd_pd_t; + using hint_class = inner_product_fwd_pd_t; arg_usage_t arg_usage(int arg) const override { if (utils::one_of(arg, DNNL_ARG_SRC, DNNL_ARG_WEIGHTS)) return arg_usage_t::input; - if (arg == DNNL_ARG_BIAS && with_bias()) return arg_usage_t::input; + if (arg == DNNL_ARG_BIAS) + return with_bias() ? 
arg_usage_t::input : arg_usage_t::unused; if (arg == DNNL_ARG_DST) return arg_usage_t::output; @@ -259,24 +253,26 @@ struct inner_product_fwd_pd_t : public inner_product_pd_t { memory_desc_t bias_md_; memory_desc_t dst_md_; + inner_product_fwd_pd_t(const op_desc_t *adesc, const primitive_attr_t *attr, + const inner_product_fwd_pd_t *hint_fwd_pd) + : inner_product_pd_t(adesc, attr, hint_fwd_pd) + , src_md_(desc_.src_desc) + , weights_md_(desc_.weights_desc) + , bias_md_(desc_.bias_desc) + , dst_md_(desc_.dst_desc) {} + bool set_default_formats_common( format_tag_t src_tag, format_tag_t wei_tag, format_tag_t dst_tag) { return set_default_formats_common_template(src_md_, src_tag, weights_md_, wei_tag, dst_md_, dst_tag, bias_md_); } }; +// NOLINTEND(google-default-arguments) +// NOLINTBEGIN(google-default-arguments) struct inner_product_bwd_data_pd_t : public inner_product_pd_t { - typedef inner_product_bwd_data_pd_t base_class; - typedef inner_product_fwd_pd_t hint_class; - - inner_product_bwd_data_pd_t(const inner_product_desc_t *adesc, - const primitive_attr_t *attr, - const inner_product_fwd_pd_t *hint_fwd_pd) - : inner_product_pd_t(adesc, attr, hint_fwd_pd) - , diff_src_md_(desc_.diff_src_desc) - , weights_md_(desc_.weights_desc) - , diff_dst_md_(desc_.diff_dst_desc) {} + using base_class = inner_product_bwd_data_pd_t; + using hint_class = inner_product_fwd_pd_t; arg_usage_t arg_usage(int arg) const override { if (utils::one_of(arg, DNNL_ARG_WEIGHTS, DNNL_ARG_DIFF_DST)) @@ -324,6 +320,14 @@ struct inner_product_bwd_data_pd_t : public inner_product_pd_t { memory_desc_t weights_md_; memory_desc_t diff_dst_md_; + inner_product_bwd_data_pd_t(const op_desc_t *adesc, + const primitive_attr_t *attr, + const inner_product_fwd_pd_t *hint_fwd_pd) + : inner_product_pd_t(adesc, attr, hint_fwd_pd) + , diff_src_md_(desc_.diff_src_desc) + , weights_md_(desc_.weights_desc) + , diff_dst_md_(desc_.diff_dst_desc) {} + bool set_default_formats_common(format_tag_t diff_src_tag, format_tag_t wei_tag, format_tag_t diff_dst_tag) { memory_desc_t dummy_md; @@ -331,19 +335,12 @@ struct inner_product_bwd_data_pd_t : public inner_product_pd_t { weights_md_, wei_tag, diff_dst_md_, diff_dst_tag, dummy_md); } }; +// NOLINTEND(google-default-arguments) +// NOLINTBEGIN(google-default-arguments) struct inner_product_bwd_weights_pd_t : public inner_product_pd_t { - typedef inner_product_bwd_weights_pd_t base_class; - typedef inner_product_fwd_pd_t hint_class; - - inner_product_bwd_weights_pd_t(const inner_product_desc_t *adesc, - const primitive_attr_t *attr, - const inner_product_fwd_pd_t *hint_fwd_pd) - : inner_product_pd_t(adesc, attr, hint_fwd_pd) - , src_md_(desc_.src_desc) - , diff_weights_md_(desc_.diff_weights_desc) - , diff_bias_md_(desc_.diff_bias_desc) - , diff_dst_md_(desc_.diff_dst_desc) {} + using base_class = inner_product_bwd_weights_pd_t; + using hint_class = inner_product_fwd_pd_t; arg_usage_t arg_usage(int arg) const override { if (utils::one_of(arg, DNNL_ARG_SRC, DNNL_ARG_DIFF_DST)) @@ -351,8 +348,8 @@ struct inner_product_bwd_weights_pd_t : public inner_product_pd_t { if (arg == DNNL_ARG_DIFF_WEIGHTS) return arg_usage_t::output; - if (arg == DNNL_ARG_DIFF_BIAS && with_bias()) - return arg_usage_t::output; + if (arg == DNNL_ARG_DIFF_BIAS) + return with_bias() ? 
arg_usage_t::output : arg_usage_t::unused; return primitive_desc_t::arg_usage(arg); } @@ -397,6 +394,15 @@ struct inner_product_bwd_weights_pd_t : public inner_product_pd_t { memory_desc_t diff_bias_md_; memory_desc_t diff_dst_md_; + inner_product_bwd_weights_pd_t(const op_desc_t *adesc, + const primitive_attr_t *attr, + const inner_product_fwd_pd_t *hint_fwd_pd) + : inner_product_pd_t(adesc, attr, hint_fwd_pd) + , src_md_(desc_.src_desc) + , diff_weights_md_(desc_.diff_weights_desc) + , diff_bias_md_(desc_.diff_bias_desc) + , diff_dst_md_(desc_.diff_dst_desc) {} + bool set_default_formats_common(format_tag_t src_tag, format_tag_t diff_wei_tag, format_tag_t diff_dst_tag) { return set_default_formats_common_template(src_md_, src_tag, @@ -404,6 +410,7 @@ struct inner_product_bwd_weights_pd_t : public inner_product_pd_t { diff_bias_md_); } }; +// NOLINTEND(google-default-arguments) } // namespace impl } // namespace dnnl diff --git a/src/common/int4.hpp b/src/common/int4.hpp index 2e692d13a5a..6658eac9921 100644 --- a/src/common/int4.hpp +++ b/src/common/int4.hpp @@ -25,41 +25,23 @@ namespace dnnl { namespace impl { -enum class int4_extract_t : uint8_t { low_half = 0, high_half = 4 }; - -inline uint8_t extract_half_byte(uint8_t val, int4_extract_t half) { - uint8_t shift = static_cast(half); - return (val >> shift) & 0xF; -} - -inline uint8_t insert_half_byte(uint8_t src, uint8_t val, int4_extract_t half) { - uint8_t shift = static_cast(half); - uint8_t mask = half == int4_extract_t::high_half ? 0x0F : 0xF0; - return (src & mask) | (uint8_t)(val << shift); -} - struct uint4_t { template ::value>::type> - constexpr uint4_t(IntegerType raw) : raw_(raw) {} + constexpr uint4_t(IntegerType raw) : raw_bits_(static_cast(raw)) { +#if __cplusplus >= 201402L + assert(0 <= raw && raw <= std::numeric_limits::max()); +#endif + } uint4_t(float val_f32) { uint8_t val_uint8 = static_cast(val_f32); - raw_ = val_uint8 & 0xF; + raw_bits_ = val_uint8 & 0xF; } - operator float() const { return (float)raw_; } - - uint8_t insert(uint8_t src, int4_extract_t half) const { - return insert_half_byte(src, raw_, half); - } + operator float() const { return (float)raw_bits_; } - static uint4_t extract(uint8_t val, int4_extract_t half) { - return uint4_t(extract_half_byte(val, half)); - } - -private: - uint8_t raw_; + uint8_t raw_bits_; }; static_assert(sizeof(uint4_t) == 1, "uint4_t must be 1 byte"); @@ -68,30 +50,21 @@ struct int4_t { template ::value>::type> - constexpr int4_t(IntegerType i) : raw_(static_cast(i)) {} + constexpr int4_t(IntegerType i) : raw_bits_(static_cast(i)) {} int4_t(float val_f32) { int8_t val_int8 = static_cast(val_f32); bool negative = val_f32 < 0; // positive numbers have the most significant bit set to 0 // negative numbers have the most significant bit set to 1 - raw_ = negative ? (val_int8 & 0xF) | 0x8 : val_int8 & 0x7; + raw_bits_ = negative ? (val_int8 & 0xF) | 0x8 : val_int8 & 0x7; } operator float() const { - float sign = (raw_ & (1 << 3)) ? -1.f : 1.f; - return sign * (float)(sign == -1 ? (~raw_ & 0xF) + 1 : raw_); - } - - uint8_t insert(uint8_t src, int4_extract_t half) const { - return insert_half_byte(src, raw_, half); - } - - static int4_t extract(uint8_t val, int4_extract_t half) { - return int4_t(extract_half_byte(val, half)); + float sign = (raw_bits_ & (1 << 3)) ? -1.f : 1.f; + return sign * (float)(sign == -1 ? 
(~raw_bits_ & 0xF) + 1 : raw_bits_); } -private: - uint8_t raw_; + uint8_t raw_bits_; }; static_assert(sizeof(int4_t) == 1, "int4_t must be 1 byte"); diff --git a/src/common/ittnotify.cpp b/src/common/ittnotify.cpp index e9c9dfa8404..2994962a997 100644 --- a/src/common/ittnotify.cpp +++ b/src/common/ittnotify.cpp @@ -18,8 +18,8 @@ #include "utils.hpp" #if defined(DNNL_ENABLE_ITT_TASKS) -#include "common/ittnotify/ittnotify.h" #include "dnnl_debug.h" +#include "ittnotify/ittnotify.h" #endif namespace dnnl { @@ -80,12 +80,16 @@ void primitive_task_start(primitive_kind_t kind) { CASE(layer_normalization), CASE(group_normalization), CASE(sdpa), + CASE(depthwise), + CASE(quantization), }; #undef CASE int kind_idx = (int)kind; assert(kind_idx >= 0); - assert((size_t)kind_idx - < sizeof(prim_kind_itt_strings) / sizeof(prim_kind_itt_strings[0])); + if (kind_idx < primitive_kind::internal_only_start) { + assert((size_t)kind_idx < sizeof(prim_kind_itt_strings) + / sizeof(prim_kind_itt_strings[0])); + } __itt_task_begin(itt_domain(), __itt_null, __itt_null, prim_kind_itt_strings[kind_idx]); thread_primitive_kind = kind; diff --git a/src/common/ittnotify.hpp b/src/common/ittnotify.hpp index b1ec4b7e248..71a51394bbb 100644 --- a/src/common/ittnotify.hpp +++ b/src/common/ittnotify.hpp @@ -24,7 +24,9 @@ namespace dnnl { namespace impl { namespace itt { -typedef enum { +// GCC treats using and typedef differently for enums and structs +// https://stackoverflow.com/questions/48613758 +typedef enum { // NOLINT(modernize-use-using) __itt_task_level_none = 0, __itt_task_level_low, __itt_task_level_high diff --git a/src/common/ittnotify/ittnotify.h b/src/common/ittnotify/ittnotify.h deleted file mode 100644 index d3df4b5e380..00000000000 --- a/src/common/ittnotify/ittnotify.h +++ /dev/null @@ -1,4459 +0,0 @@ -/* - Copyright (C) 2005-2019 Intel Corporation - - SPDX-License-Identifier: GPL-2.0-only OR BSD-3-Clause -*/ -#ifndef _ITTNOTIFY_H_ -#define _ITTNOTIFY_H_ - -/** -@file -@brief Public User API functions and types -@mainpage - -The Instrumentation and Tracing Technology API (ITT API) is used to -annotate a user's program with additional information -that can be used by correctness and performance tools. The user inserts -calls in their program. Those calls generate information that is collected -at runtime, and used by Intel(R) Threading Tools. - -@section API Concepts -The following general concepts are used throughout the API. - -@subsection Unicode Support -Many API functions take character string arguments. On Windows, there -are two versions of each such function. The function name is suffixed -by W if Unicode support is enabled, and by A otherwise. Any API function -that takes a character string argument adheres to this convention. - -@subsection Conditional Compilation -Many users prefer having an option to modify ITT API code when linking it -inside their runtimes. ITT API header file provides a mechanism to replace -ITT API function names inside your code with empty strings. To do this, -define the macros INTEL_NO_ITTNOTIFY_API during compilation and remove the -static library from the linker script. - -@subsection Domains -[see domains] -Domains provide a way to separate notification for different modules or -libraries in a program. Domains are specified by dotted character strings, -e.g. TBB.Internal.Control. - -A mechanism (to be specified) is provided to enable and disable -domains. By default, all domains are enabled. 
-@subsection Named Entities and Instances -Named entities (frames, regions, tasks, and markers) communicate -information about the program to the analysis tools. A named entity often -refers to a section of program code, or to some set of logical concepts -that the programmer wants to group together. - -Named entities relate to the programmer's static view of the program. When -the program actually executes, many instances of a given named entity -may be created. - -The API annotations denote instances of named entities. The actual -named entities are displayed using the analysis tools. In other words, -the named entities come into existence when instances are created. - -Instances of named entities may have instance identifiers (IDs). Some -API calls use instance identifiers to create relationships between -different instances of named entities. Other API calls associate data -with instances of named entities. - -Some named entities must always have instance IDs. In particular, regions -and frames always have IDs. Task and markers need IDs only if the ID is -needed in another API call (such as adding a relation or metadata). - -The lifetime of instance IDs is distinct from the lifetime of -instances. This allows various relationships to be specified separate -from the actual execution of instances. This flexibility comes at the -expense of extra API calls. - -The same ID may not be reused for different instances, unless a previous -[ref] __itt_id_destroy call for that ID has been issued. -*/ - -/** @cond exclude_from_documentation */ -#ifndef ITT_OS_WIN -# define ITT_OS_WIN 1 -#endif /* ITT_OS_WIN */ - -#ifndef ITT_OS_LINUX -# define ITT_OS_LINUX 2 -#endif /* ITT_OS_LINUX */ - -#ifndef ITT_OS_MAC -# define ITT_OS_MAC 3 -#endif /* ITT_OS_MAC */ - -#ifndef ITT_OS_FREEBSD -# define ITT_OS_FREEBSD 4 -#endif /* ITT_OS_FREEBSD */ - -#ifndef ITT_OS -# if defined WIN32 || defined _WIN32 -# define ITT_OS ITT_OS_WIN -# elif defined( __APPLE__ ) && defined( __MACH__ ) -# define ITT_OS ITT_OS_MAC -# elif defined( __FreeBSD__ ) -# define ITT_OS ITT_OS_FREEBSD -# else -# define ITT_OS ITT_OS_LINUX -# endif -#endif /* ITT_OS */ - -#ifndef ITT_PLATFORM_WIN -# define ITT_PLATFORM_WIN 1 -#endif /* ITT_PLATFORM_WIN */ - -#ifndef ITT_PLATFORM_POSIX -# define ITT_PLATFORM_POSIX 2 -#endif /* ITT_PLATFORM_POSIX */ - -#ifndef ITT_PLATFORM_MAC -# define ITT_PLATFORM_MAC 3 -#endif /* ITT_PLATFORM_MAC */ - -#ifndef ITT_PLATFORM_FREEBSD -# define ITT_PLATFORM_FREEBSD 4 -#endif /* ITT_PLATFORM_FREEBSD */ - -#ifndef ITT_PLATFORM -# if ITT_OS==ITT_OS_WIN -# define ITT_PLATFORM ITT_PLATFORM_WIN -# elif ITT_OS==ITT_OS_MAC -# define ITT_PLATFORM ITT_PLATFORM_MAC -# elif ITT_OS==ITT_OS_FREEBSD -# define ITT_PLATFORM ITT_PLATFORM_FREEBSD -# else -# define ITT_PLATFORM ITT_PLATFORM_POSIX -# endif -#endif /* ITT_PLATFORM */ - -#if defined(_UNICODE) && !defined(UNICODE) -#define UNICODE -#endif - -#include -#if ITT_PLATFORM==ITT_PLATFORM_WIN -#include -#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#include -#if defined(UNICODE) || defined(_UNICODE) -#include -#endif /* UNICODE || _UNICODE */ -#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ - -#ifndef ITTAPI_CDECL -# if ITT_PLATFORM==ITT_PLATFORM_WIN -# define ITTAPI_CDECL __cdecl -# else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -# if defined _M_IX86 || defined __i386__ -# define ITTAPI_CDECL __attribute__ ((cdecl)) -# else /* _M_IX86 || __i386__ */ -# define ITTAPI_CDECL /* actual only on x86 platform */ -# endif /* _M_IX86 || __i386__ */ -# endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ 
-#endif /* ITTAPI_CDECL */ - -#ifndef STDCALL -# if ITT_PLATFORM==ITT_PLATFORM_WIN -# define STDCALL __stdcall -# else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -# if defined _M_IX86 || defined __i386__ -# define STDCALL __attribute__ ((stdcall)) -# else /* _M_IX86 || __i386__ */ -# define STDCALL /* supported only on x86 platform */ -# endif /* _M_IX86 || __i386__ */ -# endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#endif /* STDCALL */ - -#define ITTAPI ITTAPI_CDECL -#define LIBITTAPI ITTAPI_CDECL - -/* TODO: Temporary for compatibility! */ -#define ITTAPI_CALL ITTAPI_CDECL -#define LIBITTAPI_CALL ITTAPI_CDECL - -#if ITT_PLATFORM==ITT_PLATFORM_WIN -/* use __forceinline (VC++ specific) */ -#if defined(__MINGW32__) && !defined(__cplusplus) -#define ITT_INLINE static __inline__ __attribute__((__always_inline__,__gnu_inline__)) -#else -#define ITT_INLINE static __forceinline -#endif /* __MINGW32__ */ - -#define ITT_INLINE_ATTRIBUTE /* nothing */ -#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -/* - * Generally, functions are not inlined unless optimization is specified. - * For functions declared inline, this attribute inlines the function even - * if no optimization level was specified. - */ -#ifdef __STRICT_ANSI__ -#define ITT_INLINE static -#define ITT_INLINE_ATTRIBUTE __attribute__((unused)) -#else /* __STRICT_ANSI__ */ -#define ITT_INLINE static inline -#define ITT_INLINE_ATTRIBUTE __attribute__((always_inline, unused)) -#endif /* __STRICT_ANSI__ */ -#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -/** @endcond */ - -#ifdef INTEL_ITTNOTIFY_ENABLE_LEGACY -# if ITT_PLATFORM==ITT_PLATFORM_WIN -# pragma message("WARNING!!! Deprecated API is used. Please undefine INTEL_ITTNOTIFY_ENABLE_LEGACY macro") -# else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -# warning "Deprecated API is used. Please undefine INTEL_ITTNOTIFY_ENABLE_LEGACY macro" -# endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -# include "legacy/ittnotify.h" -#endif /* INTEL_ITTNOTIFY_ENABLE_LEGACY */ - -/** @cond exclude_from_documentation */ -/* Helper macro for joining tokens */ -#define ITT_JOIN_AUX(p,n) p##n -#define ITT_JOIN(p,n) ITT_JOIN_AUX(p,n) - -#ifdef ITT_MAJOR -#undef ITT_MAJOR -#endif -#ifdef ITT_MINOR -#undef ITT_MINOR -#endif -#define ITT_MAJOR 3 -#define ITT_MINOR 0 - -/* Standard versioning of a token with major and minor version numbers */ -#define ITT_VERSIONIZE(x) \ - ITT_JOIN(x, \ - ITT_JOIN(_, \ - ITT_JOIN(ITT_MAJOR, \ - ITT_JOIN(_, ITT_MINOR)))) - -#ifndef INTEL_ITTNOTIFY_PREFIX -# define INTEL_ITTNOTIFY_PREFIX __itt_ -#endif /* INTEL_ITTNOTIFY_PREFIX */ -#ifndef INTEL_ITTNOTIFY_POSTFIX -# define INTEL_ITTNOTIFY_POSTFIX _ptr_ -#endif /* INTEL_ITTNOTIFY_POSTFIX */ - -#define ITTNOTIFY_NAME_AUX(n) ITT_JOIN(INTEL_ITTNOTIFY_PREFIX,n) -#define ITTNOTIFY_NAME(n) ITT_VERSIONIZE(ITTNOTIFY_NAME_AUX(ITT_JOIN(n,INTEL_ITTNOTIFY_POSTFIX))) - -#define ITTNOTIFY_VOID(n) (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n) -#define ITTNOTIFY_DATA(n) (!ITTNOTIFY_NAME(n)) ? 0 : ITTNOTIFY_NAME(n) - -#define ITTNOTIFY_VOID_D0(n,d) (d == NULL) ? (void)0 : (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d) -#define ITTNOTIFY_VOID_D1(n,d,x) (d == NULL) ? (void)0 : (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d,x) -#define ITTNOTIFY_VOID_D2(n,d,x,y) (d == NULL) ? (void)0 : (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d,x,y) -#define ITTNOTIFY_VOID_D3(n,d,x,y,z) (d == NULL) ? (void)0 : (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? 
(void)0 : ITTNOTIFY_NAME(n)(d,x,y,z) -#define ITTNOTIFY_VOID_D4(n,d,x,y,z,a) (d == NULL) ? (void)0 : (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d,x,y,z,a) -#define ITTNOTIFY_VOID_D5(n,d,x,y,z,a,b) (d == NULL) ? (void)0 : (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d,x,y,z,a,b) -#define ITTNOTIFY_VOID_D6(n,d,x,y,z,a,b,c) (d == NULL) ? (void)0 : (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d,x,y,z,a,b,c) -#define ITTNOTIFY_DATA_D0(n,d) (d == NULL) ? 0 : (!(d)->flags) ? 0 : (!ITTNOTIFY_NAME(n)) ? 0 : ITTNOTIFY_NAME(n)(d) -#define ITTNOTIFY_DATA_D1(n,d,x) (d == NULL) ? 0 : (!(d)->flags) ? 0 : (!ITTNOTIFY_NAME(n)) ? 0 : ITTNOTIFY_NAME(n)(d,x) -#define ITTNOTIFY_DATA_D2(n,d,x,y) (d == NULL) ? 0 : (!(d)->flags) ? 0 : (!ITTNOTIFY_NAME(n)) ? 0 : ITTNOTIFY_NAME(n)(d,x,y) -#define ITTNOTIFY_DATA_D3(n,d,x,y,z) (d == NULL) ? 0 : (!(d)->flags) ? 0 : (!ITTNOTIFY_NAME(n)) ? 0 : ITTNOTIFY_NAME(n)(d,x,y,z) -#define ITTNOTIFY_DATA_D4(n,d,x,y,z,a) (d == NULL) ? 0 : (!(d)->flags) ? 0 : (!ITTNOTIFY_NAME(n)) ? 0 : ITTNOTIFY_NAME(n)(d,x,y,z,a) -#define ITTNOTIFY_DATA_D5(n,d,x,y,z,a,b) (d == NULL) ? 0 : (!(d)->flags) ? 0 : (!ITTNOTIFY_NAME(n)) ? 0 : ITTNOTIFY_NAME(n)(d,x,y,z,a,b) -#define ITTNOTIFY_DATA_D6(n,d,x,y,z,a,b,c) (d == NULL) ? 0 : (!(d)->flags) ? 0 : (!ITTNOTIFY_NAME(n)) ? 0 : ITTNOTIFY_NAME(n)(d,x,y,z,a,b,c) - -#ifdef ITT_STUB -#undef ITT_STUB -#endif -#ifdef ITT_STUBV -#undef ITT_STUBV -#endif -#define ITT_STUBV(api,type,name,args) \ - typedef type (api* ITT_JOIN(ITTNOTIFY_NAME(name),_t)) args; \ - extern ITT_JOIN(ITTNOTIFY_NAME(name),_t) ITTNOTIFY_NAME(name); -#define ITT_STUB ITT_STUBV -/** @endcond */ - -#ifdef __cplusplus -extern "C" { -#endif /* __cplusplus */ - -/** @cond exclude_from_gpa_documentation */ -/** - * @defgroup public Public API - * @{ - * @} - */ - -/** - * @defgroup control Collection Control - * @ingroup public - * General behavior: application continues to run, but no profiling information is being collected - * - * Pausing occurs not only for the current thread but for all process as well as spawned processes - * - Intel(R) Parallel Inspector and Intel(R) Inspector XE: - * - Does not analyze or report errors that involve memory access. - * - Other errors are reported as usual. Pausing data collection in - * Intel(R) Parallel Inspector and Intel(R) Inspector XE - * only pauses tracing and analyzing memory access. - * It does not pause tracing or analyzing threading APIs. - * . - * - Intel(R) Parallel Amplifier and Intel(R) VTune(TM) Amplifier XE: - * - Does continue to record when new threads are started. - * . - * - Other effects: - * - Possible reduction of runtime overhead. - * . 
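The collection-control entry points declared just below are typically used to exclude an uninteresting phase from profiling; a minimal sketch, with the application phases invented for illustration:

    #include <ittnotify.h>

    static void load_input_files(void) { /* hypothetical setup phase */ }
    static void run_kernels(void)      { /* hypothetical hot path */ }

    int main(void) {
        __itt_pause();      /* nothing is collected while loading */
        load_input_files();
        __itt_resume();     /* collect only the region of interest */
        run_kernels();
        return 0;
    }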
- * @{ - */ -/** @brief Pause collection */ -void ITTAPI __itt_pause(void); -/** @brief Resume collection */ -void ITTAPI __itt_resume(void); -/** @brief Detach collection */ -void ITTAPI __itt_detach(void); - -/** @cond exclude_from_documentation */ -#ifndef INTEL_NO_MACRO_BODY -#ifndef INTEL_NO_ITTNOTIFY_API -ITT_STUBV(ITTAPI, void, pause, (void)) -ITT_STUBV(ITTAPI, void, resume, (void)) -ITT_STUBV(ITTAPI, void, detach, (void)) -#define __itt_pause ITTNOTIFY_VOID(pause) -#define __itt_pause_ptr ITTNOTIFY_NAME(pause) -#define __itt_resume ITTNOTIFY_VOID(resume) -#define __itt_resume_ptr ITTNOTIFY_NAME(resume) -#define __itt_detach ITTNOTIFY_VOID(detach) -#define __itt_detach_ptr ITTNOTIFY_NAME(detach) -#else /* INTEL_NO_ITTNOTIFY_API */ -#define __itt_pause() -#define __itt_pause_ptr 0 -#define __itt_resume() -#define __itt_resume_ptr 0 -#define __itt_detach() -#define __itt_detach_ptr 0 -#endif /* INTEL_NO_ITTNOTIFY_API */ -#else /* INTEL_NO_MACRO_BODY */ -#define __itt_pause_ptr 0 -#define __itt_resume_ptr 0 -#define __itt_detach_ptr 0 -#endif /* INTEL_NO_MACRO_BODY */ -/** @endcond */ -/** @} control group */ -/** @endcond */ - -/** - * @defgroup Intel Processor Trace control - * API from this group provides control over collection and analysis of Intel Processor Trace (Intel PT) data - * Information about Intel Processor Trace technology can be found here (Volume 3 chapter 35): - * https://software.intel.com/sites/default/files/managed/39/c5/325462-sdm-vol-1-2abcd-3abcd.pdf - * Use this API to mark particular code regions for loading detailed performance statistics. - * This mode makes your analysis faster and more accurate. - * @{ -*/ -typedef unsigned char __itt_pt_region; - -/** - * @brief function saves a region name marked with Intel PT API and returns a region id. - * Only 7 names can be registered. Attempts to register more names will be ignored and a region id with auto names will be returned. 
- * For automatic naming of regions pass NULL as function parameter -*/ -#if ITT_PLATFORM==ITT_PLATFORM_WIN -__itt_pt_region ITTAPI __itt_pt_region_createA(const char *name); -__itt_pt_region ITTAPI __itt_pt_region_createW(const wchar_t *name); -#if defined(UNICODE) || defined(_UNICODE) -# define __itt_pt_region_create __itt_pt_region_createW -#else /* UNICODE */ -# define __itt_pt_region_create __itt_pt_region_createA -#endif /* UNICODE */ -#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -__itt_pt_region ITTAPI __itt_pt_region_create(const char *name); -#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ - -/** @cond exclude_from_documentation */ -#ifndef INTEL_NO_MACRO_BODY -#ifndef INTEL_NO_ITTNOTIFY_API -#if ITT_PLATFORM==ITT_PLATFORM_WIN -ITT_STUB(ITTAPI, __itt_pt_region, pt_region_createA, (const char *name)) -ITT_STUB(ITTAPI, __itt_pt_region, pt_region_createW, (const wchar_t *name)) -#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -ITT_STUB(ITTAPI, __itt_pt_region, pt_region_create, (const char *name)) -#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#if ITT_PLATFORM==ITT_PLATFORM_WIN -#define __itt_pt_region_createA ITTNOTIFY_DATA(pt_region_createA) -#define __itt_pt_region_createA_ptr ITTNOTIFY_NAME(pt_region_createA) -#define __itt_pt_region_createW ITTNOTIFY_DATA(pt_region_createW) -#define __itt_pt_region_createW_ptr ITTNOTIFY_NAME(pt_region_createW) -#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#define __itt_pt_region_create ITTNOTIFY_DATA(pt_region_create) -#define __itt_pt_region_create_ptr ITTNOTIFY_NAME(pt_region_create) -#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#else /* INTEL_NO_ITTNOTIFY_API */ -#if ITT_PLATFORM==ITT_PLATFORM_WIN -#define __itt_pt_region_createA(name) (__itt_pt_region)0 -#define __itt_pt_region_createA_ptr 0 -#define __itt_pt_region_createW(name) (__itt_pt_region)0 -#define __itt_pt_region_createW_ptr 0 -#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#define __itt_pt_region_create(name) (__itt_pt_region)0 -#define __itt_pt_region_create_ptr 0 -#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#endif /* INTEL_NO_ITTNOTIFY_API */ -#else /* INTEL_NO_MACRO_BODY */ -#if ITT_PLATFORM==ITT_PLATFORM_WIN -#define __itt_pt_region_createA_ptr 0 -#define __itt_pt_region_createW_ptr 0 -#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#define __itt_pt_region_create_ptr 0 -#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#endif /* INTEL_NO_MACRO_BODY */ -/** @endcond */ - -/** - * @brief function contains a special code pattern identified on the post-processing stage and - * marks the beginning of a code region targeted for Intel PT analysis - * @param[in] region - region id, 0 <= region < 8 -*/ -void __itt_mark_pt_region_begin(__itt_pt_region region); -/** - * @brief function contains a special code pattern identified on the post-processing stage and - * marks the end of a code region targeted for Intel PT analysis - * @param[in] region - region id, 0 <= region < 8 -*/ -void __itt_mark_pt_region_end(__itt_pt_region region); -/** @} Intel PT control group*/ - -/** - * @defgroup threads Threads - * @ingroup public - * Give names to threads - * @{ - */ -/** - * @brief Sets thread name of calling thread - * @param[in] name - name of thread - */ -#if ITT_PLATFORM==ITT_PLATFORM_WIN -void ITTAPI __itt_thread_set_nameA(const char *name); -void ITTAPI __itt_thread_set_nameW(const wchar_t *name); -#if defined(UNICODE) || defined(_UNICODE) -# define __itt_thread_set_name __itt_thread_set_nameW -# define __itt_thread_set_name_ptr __itt_thread_set_nameW_ptr -#else /* UNICODE */ -# define __itt_thread_set_name 
__itt_thread_set_nameA -# define __itt_thread_set_name_ptr __itt_thread_set_nameA_ptr -#endif /* UNICODE */ -#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -void ITTAPI __itt_thread_set_name(const char *name); -#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ - -/** @cond exclude_from_documentation */ -#ifndef INTEL_NO_MACRO_BODY -#ifndef INTEL_NO_ITTNOTIFY_API -#if ITT_PLATFORM==ITT_PLATFORM_WIN -ITT_STUBV(ITTAPI, void, thread_set_nameA, (const char *name)) -ITT_STUBV(ITTAPI, void, thread_set_nameW, (const wchar_t *name)) -#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -ITT_STUBV(ITTAPI, void, thread_set_name, (const char *name)) -#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#if ITT_PLATFORM==ITT_PLATFORM_WIN -#define __itt_thread_set_nameA ITTNOTIFY_VOID(thread_set_nameA) -#define __itt_thread_set_nameA_ptr ITTNOTIFY_NAME(thread_set_nameA) -#define __itt_thread_set_nameW ITTNOTIFY_VOID(thread_set_nameW) -#define __itt_thread_set_nameW_ptr ITTNOTIFY_NAME(thread_set_nameW) -#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#define __itt_thread_set_name ITTNOTIFY_VOID(thread_set_name) -#define __itt_thread_set_name_ptr ITTNOTIFY_NAME(thread_set_name) -#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#else /* INTEL_NO_ITTNOTIFY_API */ -#if ITT_PLATFORM==ITT_PLATFORM_WIN -#define __itt_thread_set_nameA(name) -#define __itt_thread_set_nameA_ptr 0 -#define __itt_thread_set_nameW(name) -#define __itt_thread_set_nameW_ptr 0 -#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#define __itt_thread_set_name(name) -#define __itt_thread_set_name_ptr 0 -#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#endif /* INTEL_NO_ITTNOTIFY_API */ -#else /* INTEL_NO_MACRO_BODY */ -#if ITT_PLATFORM==ITT_PLATFORM_WIN -#define __itt_thread_set_nameA_ptr 0 -#define __itt_thread_set_nameW_ptr 0 -#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#define __itt_thread_set_name_ptr 0 -#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#endif /* INTEL_NO_MACRO_BODY */ -/** @endcond */ - -/** @cond exclude_from_gpa_documentation */ - -/** - * @brief Mark current thread as ignored from this point on, for the duration of its existence. 
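A minimal sketch of the thread-naming call declared above, using the POSIX spelling (on Windows the A/W-suffixed variants apply); the worker thread is invented for illustration:

    #include <ittnotify.h>
    #include <pthread.h>

    static void *worker(void *arg) {
        (void)arg;
        /* Label this thread in the profiler's timeline. */
        __itt_thread_set_name("worker-pool-0");
        /* ... thread body ... */
        return NULL;
    }

    int main(void) {
        pthread_t t;
        pthread_create(&t, NULL, worker, NULL);
        pthread_join(t, NULL);
        return 0;
    }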
- */ -void ITTAPI __itt_thread_ignore(void); - -/** @cond exclude_from_documentation */ -#ifndef INTEL_NO_MACRO_BODY -#ifndef INTEL_NO_ITTNOTIFY_API -ITT_STUBV(ITTAPI, void, thread_ignore, (void)) -#define __itt_thread_ignore ITTNOTIFY_VOID(thread_ignore) -#define __itt_thread_ignore_ptr ITTNOTIFY_NAME(thread_ignore) -#else /* INTEL_NO_ITTNOTIFY_API */ -#define __itt_thread_ignore() -#define __itt_thread_ignore_ptr 0 -#endif /* INTEL_NO_ITTNOTIFY_API */ -#else /* INTEL_NO_MACRO_BODY */ -#define __itt_thread_ignore_ptr 0 -#endif /* INTEL_NO_MACRO_BODY */ -/** @endcond */ -/** @} threads group */ - -/** - * @defgroup suppress Error suppression - * @ingroup public - * General behavior: application continues to run, but errors are suppressed - * - * @{ - */ - -/*****************************************************************//** - * @name group of functions used for error suppression in correctness tools - *********************************************************************/ -/** @{ */ -/** - * @hideinitializer - * @brief possible value for suppression mask - */ -#define __itt_suppress_all_errors 0x7fffffff - -/** - * @hideinitializer - * @brief possible value for suppression mask (suppresses errors from threading analysis) - */ -#define __itt_suppress_threading_errors 0x000000ff - -/** - * @hideinitializer - * @brief possible value for suppression mask (suppresses errors from memory analysis) - */ -#define __itt_suppress_memory_errors 0x0000ff00 - -/** - * @brief Start suppressing errors identified in mask on this thread - */ -void ITTAPI __itt_suppress_push(unsigned int mask); - -/** @cond exclude_from_documentation */ -#ifndef INTEL_NO_MACRO_BODY -#ifndef INTEL_NO_ITTNOTIFY_API -ITT_STUBV(ITTAPI, void, suppress_push, (unsigned int mask)) -#define __itt_suppress_push ITTNOTIFY_VOID(suppress_push) -#define __itt_suppress_push_ptr ITTNOTIFY_NAME(suppress_push) -#else /* INTEL_NO_ITTNOTIFY_API */ -#define __itt_suppress_push(mask) -#define __itt_suppress_push_ptr 0 -#endif /* INTEL_NO_ITTNOTIFY_API */ -#else /* INTEL_NO_MACRO_BODY */ -#define __itt_suppress_push_ptr 0 -#endif /* INTEL_NO_MACRO_BODY */ -/** @endcond */ - -/** - * @brief Undo the effects of the matching call to __itt_suppress_push - */ -void ITTAPI __itt_suppress_pop(void); - -/** @cond exclude_from_documentation */ -#ifndef INTEL_NO_MACRO_BODY -#ifndef INTEL_NO_ITTNOTIFY_API -ITT_STUBV(ITTAPI, void, suppress_pop, (void)) -#define __itt_suppress_pop ITTNOTIFY_VOID(suppress_pop) -#define __itt_suppress_pop_ptr ITTNOTIFY_NAME(suppress_pop) -#else /* INTEL_NO_ITTNOTIFY_API */ -#define __itt_suppress_pop() -#define __itt_suppress_pop_ptr 0 -#endif /* INTEL_NO_ITTNOTIFY_API */ -#else /* INTEL_NO_MACRO_BODY */ -#define __itt_suppress_pop_ptr 0 -#endif /* INTEL_NO_MACRO_BODY */ -/** @endcond */ - -/** - * @enum __itt_model_disable - * @brief Enumerator for the disable methods - */ -typedef enum __itt_suppress_mode { - __itt_unsuppress_range, - __itt_suppress_range -} __itt_suppress_mode_t; - -/** - * @enum __itt_collection_state - * @brief Enumerator for collection state. All non-work states have negative values. 
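A minimal sketch of the suppression window formed by the push/pop pair above, with the racy-counter scenario invented for illustration:

    #include <ittnotify.h>

    void bump_stat_counter(volatile int *counter) {
        /* Intentionally racy statistics counter: hide it from memory
           analysis for this window only. */
        __itt_suppress_push(__itt_suppress_memory_errors);
        ++*counter;
        __itt_suppress_pop(); /* the matching pop restores checking */
    }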
- */ -typedef enum { - __itt_collection_uninitialized = 0, /* uninitialized */ - __itt_collection_init_fail = 1, /* failed to init */ - __itt_collection_collector_absent = 2, /* non work state collector exists */ - __itt_collection_collector_exists = 3, /* work state collector exists */ - __itt_collection_init_successful = 4 /* success to init */ -} __itt_collection_state; - -/** - * @brief Mark a range of memory for error suppression or unsuppression for error types included in mask - */ -void ITTAPI __itt_suppress_mark_range(__itt_suppress_mode_t mode, unsigned int mask, void * address, size_t size); - -/** @cond exclude_from_documentation */ -#ifndef INTEL_NO_MACRO_BODY -#ifndef INTEL_NO_ITTNOTIFY_API -ITT_STUBV(ITTAPI, void, suppress_mark_range, (__itt_suppress_mode_t mode, unsigned int mask, void * address, size_t size)) -#define __itt_suppress_mark_range ITTNOTIFY_VOID(suppress_mark_range) -#define __itt_suppress_mark_range_ptr ITTNOTIFY_NAME(suppress_mark_range) -#else /* INTEL_NO_ITTNOTIFY_API */ -#define __itt_suppress_mark_range(mask) -#define __itt_suppress_mark_range_ptr 0 -#endif /* INTEL_NO_ITTNOTIFY_API */ -#else /* INTEL_NO_MACRO_BODY */ -#define __itt_suppress_mark_range_ptr 0 -#endif /* INTEL_NO_MACRO_BODY */ -/** @endcond */ - -/** - * @brief Undo the effect of a matching call to __itt_suppress_mark_range. If not matching - * call is found, nothing is changed. - */ -void ITTAPI __itt_suppress_clear_range(__itt_suppress_mode_t mode, unsigned int mask, void * address, size_t size); - -/** @cond exclude_from_documentation */ -#ifndef INTEL_NO_MACRO_BODY -#ifndef INTEL_NO_ITTNOTIFY_API -ITT_STUBV(ITTAPI, void, suppress_clear_range, (__itt_suppress_mode_t mode, unsigned int mask, void * address, size_t size)) -#define __itt_suppress_clear_range ITTNOTIFY_VOID(suppress_clear_range) -#define __itt_suppress_clear_range_ptr ITTNOTIFY_NAME(suppress_clear_range) -#else /* INTEL_NO_ITTNOTIFY_API */ -#define __itt_suppress_clear_range(mask) -#define __itt_suppress_clear_range_ptr 0 -#endif /* INTEL_NO_ITTNOTIFY_API */ -#else /* INTEL_NO_MACRO_BODY */ -#define __itt_suppress_clear_range_ptr 0 -#endif /* INTEL_NO_MACRO_BODY */ -/** @endcond */ -/** @} */ -/** @} suppress group */ - -/** - * @defgroup sync Synchronization - * @ingroup public - * Indicate user-written synchronization code - * @{ - */ -/** - * @hideinitializer - * @brief possible value of attribute argument for sync object type - */ -#define __itt_attr_barrier 1 - -/** - * @hideinitializer - * @brief possible value of attribute argument for sync object type - */ -#define __itt_attr_mutex 2 - -/** -@brief Name a synchronization object -@param[in] addr Handle for the synchronization object. You should -use a real address to uniquely identify the synchronization object. -@param[in] objtype null-terminated object type string. If NULL is -passed, the name will be "User Synchronization". -@param[in] objname null-terminated object name string. If NULL, -no name will be assigned to the object. 
-@param[in] attribute one of [#__itt_attr_barrier, #__itt_attr_mutex] - */ - -#if ITT_PLATFORM==ITT_PLATFORM_WIN -void ITTAPI __itt_sync_createA(void *addr, const char *objtype, const char *objname, int attribute); -void ITTAPI __itt_sync_createW(void *addr, const wchar_t *objtype, const wchar_t *objname, int attribute); -#if defined(UNICODE) || defined(_UNICODE) -# define __itt_sync_create __itt_sync_createW -# define __itt_sync_create_ptr __itt_sync_createW_ptr -#else /* UNICODE */ -# define __itt_sync_create __itt_sync_createA -# define __itt_sync_create_ptr __itt_sync_createA_ptr -#endif /* UNICODE */ -#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -void ITTAPI __itt_sync_create (void *addr, const char *objtype, const char *objname, int attribute); -#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ - -/** @cond exclude_from_documentation */ -#ifndef INTEL_NO_MACRO_BODY -#ifndef INTEL_NO_ITTNOTIFY_API -#if ITT_PLATFORM==ITT_PLATFORM_WIN -ITT_STUBV(ITTAPI, void, sync_createA, (void *addr, const char *objtype, const char *objname, int attribute)) -ITT_STUBV(ITTAPI, void, sync_createW, (void *addr, const wchar_t *objtype, const wchar_t *objname, int attribute)) -#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -ITT_STUBV(ITTAPI, void, sync_create, (void *addr, const char* objtype, const char* objname, int attribute)) -#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#if ITT_PLATFORM==ITT_PLATFORM_WIN -#define __itt_sync_createA ITTNOTIFY_VOID(sync_createA) -#define __itt_sync_createA_ptr ITTNOTIFY_NAME(sync_createA) -#define __itt_sync_createW ITTNOTIFY_VOID(sync_createW) -#define __itt_sync_createW_ptr ITTNOTIFY_NAME(sync_createW) -#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#define __itt_sync_create ITTNOTIFY_VOID(sync_create) -#define __itt_sync_create_ptr ITTNOTIFY_NAME(sync_create) -#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#else /* INTEL_NO_ITTNOTIFY_API */ -#if ITT_PLATFORM==ITT_PLATFORM_WIN -#define __itt_sync_createA(addr, objtype, objname, attribute) -#define __itt_sync_createA_ptr 0 -#define __itt_sync_createW(addr, objtype, objname, attribute) -#define __itt_sync_createW_ptr 0 -#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#define __itt_sync_create(addr, objtype, objname, attribute) -#define __itt_sync_create_ptr 0 -#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#endif /* INTEL_NO_ITTNOTIFY_API */ -#else /* INTEL_NO_MACRO_BODY */ -#if ITT_PLATFORM==ITT_PLATFORM_WIN -#define __itt_sync_createA_ptr 0 -#define __itt_sync_createW_ptr 0 -#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#define __itt_sync_create_ptr 0 -#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#endif /* INTEL_NO_MACRO_BODY */ -/** @endcond */ - -/** -@brief Rename a synchronization object - -You can use the rename call to assign or reassign a name to a given -synchronization object. -@param[in] addr handle for the synchronization object. -@param[in] name null-terminated object name string. 
-*/ -#if ITT_PLATFORM==ITT_PLATFORM_WIN -void ITTAPI __itt_sync_renameA(void *addr, const char *name); -void ITTAPI __itt_sync_renameW(void *addr, const wchar_t *name); -#if defined(UNICODE) || defined(_UNICODE) -# define __itt_sync_rename __itt_sync_renameW -# define __itt_sync_rename_ptr __itt_sync_renameW_ptr -#else /* UNICODE */ -# define __itt_sync_rename __itt_sync_renameA -# define __itt_sync_rename_ptr __itt_sync_renameA_ptr -#endif /* UNICODE */ -#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -void ITTAPI __itt_sync_rename(void *addr, const char *name); -#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ - -/** @cond exclude_from_documentation */ -#ifndef INTEL_NO_MACRO_BODY -#ifndef INTEL_NO_ITTNOTIFY_API -#if ITT_PLATFORM==ITT_PLATFORM_WIN -ITT_STUBV(ITTAPI, void, sync_renameA, (void *addr, const char *name)) -ITT_STUBV(ITTAPI, void, sync_renameW, (void *addr, const wchar_t *name)) -#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -ITT_STUBV(ITTAPI, void, sync_rename, (void *addr, const char *name)) -#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#if ITT_PLATFORM==ITT_PLATFORM_WIN -#define __itt_sync_renameA ITTNOTIFY_VOID(sync_renameA) -#define __itt_sync_renameA_ptr ITTNOTIFY_NAME(sync_renameA) -#define __itt_sync_renameW ITTNOTIFY_VOID(sync_renameW) -#define __itt_sync_renameW_ptr ITTNOTIFY_NAME(sync_renameW) -#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#define __itt_sync_rename ITTNOTIFY_VOID(sync_rename) -#define __itt_sync_rename_ptr ITTNOTIFY_NAME(sync_rename) -#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#else /* INTEL_NO_ITTNOTIFY_API */ -#if ITT_PLATFORM==ITT_PLATFORM_WIN -#define __itt_sync_renameA(addr, name) -#define __itt_sync_renameA_ptr 0 -#define __itt_sync_renameW(addr, name) -#define __itt_sync_renameW_ptr 0 -#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#define __itt_sync_rename(addr, name) -#define __itt_sync_rename_ptr 0 -#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#endif /* INTEL_NO_ITTNOTIFY_API */ -#else /* INTEL_NO_MACRO_BODY */ -#if ITT_PLATFORM==ITT_PLATFORM_WIN -#define __itt_sync_renameA_ptr 0 -#define __itt_sync_renameW_ptr 0 -#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#define __itt_sync_rename_ptr 0 -#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#endif /* INTEL_NO_MACRO_BODY */ -/** @endcond */ - -/** - @brief Destroy a synchronization object. - @param addr Handle for the synchronization object. 
- */ -void ITTAPI __itt_sync_destroy(void *addr); - -/** @cond exclude_from_documentation */ -#ifndef INTEL_NO_MACRO_BODY -#ifndef INTEL_NO_ITTNOTIFY_API -ITT_STUBV(ITTAPI, void, sync_destroy, (void *addr)) -#define __itt_sync_destroy ITTNOTIFY_VOID(sync_destroy) -#define __itt_sync_destroy_ptr ITTNOTIFY_NAME(sync_destroy) -#else /* INTEL_NO_ITTNOTIFY_API */ -#define __itt_sync_destroy(addr) -#define __itt_sync_destroy_ptr 0 -#endif /* INTEL_NO_ITTNOTIFY_API */ -#else /* INTEL_NO_MACRO_BODY */ -#define __itt_sync_destroy_ptr 0 -#endif /* INTEL_NO_MACRO_BODY */ -/** @endcond */ - -/*****************************************************************//** - * @name group of functions is used for performance measurement tools - *********************************************************************/ -/** @{ */ -/** - * @brief Enter spin loop on user-defined sync object - */ -void ITTAPI __itt_sync_prepare(void* addr); - -/** @cond exclude_from_documentation */ -#ifndef INTEL_NO_MACRO_BODY -#ifndef INTEL_NO_ITTNOTIFY_API -ITT_STUBV(ITTAPI, void, sync_prepare, (void *addr)) -#define __itt_sync_prepare ITTNOTIFY_VOID(sync_prepare) -#define __itt_sync_prepare_ptr ITTNOTIFY_NAME(sync_prepare) -#else /* INTEL_NO_ITTNOTIFY_API */ -#define __itt_sync_prepare(addr) -#define __itt_sync_prepare_ptr 0 -#endif /* INTEL_NO_ITTNOTIFY_API */ -#else /* INTEL_NO_MACRO_BODY */ -#define __itt_sync_prepare_ptr 0 -#endif /* INTEL_NO_MACRO_BODY */ -/** @endcond */ - -/** - * @brief Quit spin loop without acquiring spin object - */ -void ITTAPI __itt_sync_cancel(void *addr); - -/** @cond exclude_from_documentation */ -#ifndef INTEL_NO_MACRO_BODY -#ifndef INTEL_NO_ITTNOTIFY_API -ITT_STUBV(ITTAPI, void, sync_cancel, (void *addr)) -#define __itt_sync_cancel ITTNOTIFY_VOID(sync_cancel) -#define __itt_sync_cancel_ptr ITTNOTIFY_NAME(sync_cancel) -#else /* INTEL_NO_ITTNOTIFY_API */ -#define __itt_sync_cancel(addr) -#define __itt_sync_cancel_ptr 0 -#endif /* INTEL_NO_ITTNOTIFY_API */ -#else /* INTEL_NO_MACRO_BODY */ -#define __itt_sync_cancel_ptr 0 -#endif /* INTEL_NO_MACRO_BODY */ -/** @endcond */ - -/** - * @brief Successful spin loop completion (sync object acquired) - */ -void ITTAPI __itt_sync_acquired(void *addr); - -/** @cond exclude_from_documentation */ -#ifndef INTEL_NO_MACRO_BODY -#ifndef INTEL_NO_ITTNOTIFY_API -ITT_STUBV(ITTAPI, void, sync_acquired, (void *addr)) -#define __itt_sync_acquired ITTNOTIFY_VOID(sync_acquired) -#define __itt_sync_acquired_ptr ITTNOTIFY_NAME(sync_acquired) -#else /* INTEL_NO_ITTNOTIFY_API */ -#define __itt_sync_acquired(addr) -#define __itt_sync_acquired_ptr 0 -#endif /* INTEL_NO_ITTNOTIFY_API */ -#else /* INTEL_NO_MACRO_BODY */ -#define __itt_sync_acquired_ptr 0 -#endif /* INTEL_NO_MACRO_BODY */ -/** @endcond */ - -/** - * @brief Start sync object releasing code. Is called before the lock release call. 
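Taken together, the sync annotations above and just below map onto a user-written lock roughly as sketched here; the spin lock itself is invented for illustration and uses GCC atomic builtins:

    #include <ittnotify.h>

    typedef struct { volatile long locked; } spinlock_t;

    void spinlock_init(spinlock_t *l) {
        l->locked = 0;
        __itt_sync_create(l, "spinlock_t", "frame queue lock", __itt_attr_mutex);
    }

    void spinlock_acquire(spinlock_t *l) {
        __itt_sync_prepare(l);                          /* entering the spin loop */
        while (__sync_lock_test_and_set(&l->locked, 1))
            ;                                           /* spinning */
        __itt_sync_acquired(l);                         /* loop exited with the lock held */
    }

    void spinlock_release(spinlock_t *l) {
        __itt_sync_releasing(l);                        /* called before the actual release */
        __sync_lock_release(&l->locked);
    }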
- */ -void ITTAPI __itt_sync_releasing(void* addr); - -/** @cond exclude_from_documentation */ -#ifndef INTEL_NO_MACRO_BODY -#ifndef INTEL_NO_ITTNOTIFY_API -ITT_STUBV(ITTAPI, void, sync_releasing, (void *addr)) -#define __itt_sync_releasing ITTNOTIFY_VOID(sync_releasing) -#define __itt_sync_releasing_ptr ITTNOTIFY_NAME(sync_releasing) -#else /* INTEL_NO_ITTNOTIFY_API */ -#define __itt_sync_releasing(addr) -#define __itt_sync_releasing_ptr 0 -#endif /* INTEL_NO_ITTNOTIFY_API */ -#else /* INTEL_NO_MACRO_BODY */ -#define __itt_sync_releasing_ptr 0 -#endif /* INTEL_NO_MACRO_BODY */ -/** @endcond */ -/** @} */ - -/** @} sync group */ - -/**************************************************************//** - * @name group of functions is used for correctness checking tools - ******************************************************************/ -/** @{ */ -/** - * @ingroup legacy - * @deprecated Legacy API - * @brief Fast synchronization which does no require spinning. - * - This special function is to be used by TBB and OpenMP libraries only when they know - * there is no spin but they need to suppress TC warnings about shared variable modifications. - * - It only has corresponding pointers in static library and does not have corresponding function - * in dynamic library. - * @see void __itt_sync_prepare(void* addr); - */ -void ITTAPI __itt_fsync_prepare(void* addr); - -/** @cond exclude_from_documentation */ -#ifndef INTEL_NO_MACRO_BODY -#ifndef INTEL_NO_ITTNOTIFY_API -ITT_STUBV(ITTAPI, void, fsync_prepare, (void *addr)) -#define __itt_fsync_prepare ITTNOTIFY_VOID(fsync_prepare) -#define __itt_fsync_prepare_ptr ITTNOTIFY_NAME(fsync_prepare) -#else /* INTEL_NO_ITTNOTIFY_API */ -#define __itt_fsync_prepare(addr) -#define __itt_fsync_prepare_ptr 0 -#endif /* INTEL_NO_ITTNOTIFY_API */ -#else /* INTEL_NO_MACRO_BODY */ -#define __itt_fsync_prepare_ptr 0 -#endif /* INTEL_NO_MACRO_BODY */ -/** @endcond */ - -/** - * @ingroup legacy - * @deprecated Legacy API - * @brief Fast synchronization which does no require spinning. - * - This special function is to be used by TBB and OpenMP libraries only when they know - * there is no spin but they need to suppress TC warnings about shared variable modifications. - * - It only has corresponding pointers in static library and does not have corresponding function - * in dynamic library. - * @see void __itt_sync_cancel(void *addr); - */ -void ITTAPI __itt_fsync_cancel(void *addr); - -/** @cond exclude_from_documentation */ -#ifndef INTEL_NO_MACRO_BODY -#ifndef INTEL_NO_ITTNOTIFY_API -ITT_STUBV(ITTAPI, void, fsync_cancel, (void *addr)) -#define __itt_fsync_cancel ITTNOTIFY_VOID(fsync_cancel) -#define __itt_fsync_cancel_ptr ITTNOTIFY_NAME(fsync_cancel) -#else /* INTEL_NO_ITTNOTIFY_API */ -#define __itt_fsync_cancel(addr) -#define __itt_fsync_cancel_ptr 0 -#endif /* INTEL_NO_ITTNOTIFY_API */ -#else /* INTEL_NO_MACRO_BODY */ -#define __itt_fsync_cancel_ptr 0 -#endif /* INTEL_NO_MACRO_BODY */ -/** @endcond */ - -/** - * @ingroup legacy - * @deprecated Legacy API - * @brief Fast synchronization which does no require spinning. - * - This special function is to be used by TBB and OpenMP libraries only when they know - * there is no spin but they need to suppress TC warnings about shared variable modifications. - * - It only has corresponding pointers in static library and does not have corresponding function - * in dynamic library. 
- * @see void __itt_sync_acquired(void *addr); - */ -void ITTAPI __itt_fsync_acquired(void *addr); - -/** @cond exclude_from_documentation */ -#ifndef INTEL_NO_MACRO_BODY -#ifndef INTEL_NO_ITTNOTIFY_API -ITT_STUBV(ITTAPI, void, fsync_acquired, (void *addr)) -#define __itt_fsync_acquired ITTNOTIFY_VOID(fsync_acquired) -#define __itt_fsync_acquired_ptr ITTNOTIFY_NAME(fsync_acquired) -#else /* INTEL_NO_ITTNOTIFY_API */ -#define __itt_fsync_acquired(addr) -#define __itt_fsync_acquired_ptr 0 -#endif /* INTEL_NO_ITTNOTIFY_API */ -#else /* INTEL_NO_MACRO_BODY */ -#define __itt_fsync_acquired_ptr 0 -#endif /* INTEL_NO_MACRO_BODY */ -/** @endcond */ - -/** - * @ingroup legacy - * @deprecated Legacy API - * @brief Fast synchronization which does no require spinning. - * - This special function is to be used by TBB and OpenMP libraries only when they know - * there is no spin but they need to suppress TC warnings about shared variable modifications. - * - It only has corresponding pointers in static library and does not have corresponding function - * in dynamic library. - * @see void __itt_sync_releasing(void* addr); - */ -void ITTAPI __itt_fsync_releasing(void* addr); - -/** @cond exclude_from_documentation */ -#ifndef INTEL_NO_MACRO_BODY -#ifndef INTEL_NO_ITTNOTIFY_API -ITT_STUBV(ITTAPI, void, fsync_releasing, (void *addr)) -#define __itt_fsync_releasing ITTNOTIFY_VOID(fsync_releasing) -#define __itt_fsync_releasing_ptr ITTNOTIFY_NAME(fsync_releasing) -#else /* INTEL_NO_ITTNOTIFY_API */ -#define __itt_fsync_releasing(addr) -#define __itt_fsync_releasing_ptr 0 -#endif /* INTEL_NO_ITTNOTIFY_API */ -#else /* INTEL_NO_MACRO_BODY */ -#define __itt_fsync_releasing_ptr 0 -#endif /* INTEL_NO_MACRO_BODY */ -/** @endcond */ -/** @} */ - -/** - * @defgroup model Modeling by Intel(R) Parallel Advisor - * @ingroup public - * This is the subset of itt used for modeling by Intel(R) Parallel Advisor. - * This API is called ONLY using annotate.h, by "Annotation" macros - * the user places in their sources during the parallelism modeling steps. - * - * site_begin/end and task_begin/end take the address of handle variables, - * which are writeable by the API. Handles must be 0 initialized prior - * to the first call to begin, or may cause a run-time failure. - * The handles are initialized in a multi-thread safe way by the API if - * the handle is 0. The commonly expected idiom is one static handle to - * identify a site or task. If a site or task of the same name has already - * been started during this collection, the same handle MAY be returned, - * but is not required to be - it is unspecified if data merging is done - * based on name. These routines also take an instance variable. Like - * the lexical instance, these must be 0 initialized. Unlike the lexical - * instance, this is used to track a single dynamic instance. - * - * API used by the Intel(R) Parallel Advisor to describe potential concurrency - * and related activities. User-added source annotations expand to calls - * to these procedures to enable modeling of a hypothetical concurrent - * execution serially. 
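The modeling entry points declared just below are normally reached through the Annotation macros in annotate.h; calling them directly, a sketch looks roughly like this (the site and task names are invented for illustration):

    #include <ittnotify.h>

    void process_rows(int n) {
        __itt_model_site_beginA("row_site");        /* potential parallel site */
        for (int i = 0; i < n; ++i) {
            __itt_model_task_beginA("row_task");    /* potential task per iteration */
            /* ... loop body that might become a parallel task ... */
            __itt_model_task_end_2();
        }
        __itt_model_site_end_2();
    }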
- * @{ - */ -#if !defined(_ADVISOR_ANNOTATE_H_) || defined(ANNOTATE_EXPAND_NULL) - -typedef void* __itt_model_site; /*!< @brief handle for lexical site */ -typedef void* __itt_model_site_instance; /*!< @brief handle for dynamic instance */ -typedef void* __itt_model_task; /*!< @brief handle for lexical site */ -typedef void* __itt_model_task_instance; /*!< @brief handle for dynamic instance */ - -/** - * @enum __itt_model_disable - * @brief Enumerator for the disable methods - */ -typedef enum { - __itt_model_disable_observation, - __itt_model_disable_collection -} __itt_model_disable; - -#endif /* !_ADVISOR_ANNOTATE_H_ || ANNOTATE_EXPAND_NULL */ - -/** - * @brief ANNOTATE_SITE_BEGIN/ANNOTATE_SITE_END support. - * - * site_begin/end model a potential concurrency site. - * site instances may be recursively nested with themselves. - * site_end exits the most recently started but unended site for the current - * thread. The handle passed to end may be used to validate structure. - * Instances of a site encountered on different threads concurrently - * are considered completely distinct. If the site name for two different - * lexical sites match, it is unspecified whether they are treated as the - * same or different for data presentation. - */ -void ITTAPI __itt_model_site_begin(__itt_model_site *site, __itt_model_site_instance *instance, const char *name); -#if ITT_PLATFORM==ITT_PLATFORM_WIN -void ITTAPI __itt_model_site_beginW(const wchar_t *name); -#endif -void ITTAPI __itt_model_site_beginA(const char *name); -void ITTAPI __itt_model_site_beginAL(const char *name, size_t siteNameLen); -void ITTAPI __itt_model_site_end (__itt_model_site *site, __itt_model_site_instance *instance); -void ITTAPI __itt_model_site_end_2(void); - -/** @cond exclude_from_documentation */ -#ifndef INTEL_NO_MACRO_BODY -#ifndef INTEL_NO_ITTNOTIFY_API -ITT_STUBV(ITTAPI, void, model_site_begin, (__itt_model_site *site, __itt_model_site_instance *instance, const char *name)) -#if ITT_PLATFORM==ITT_PLATFORM_WIN -ITT_STUBV(ITTAPI, void, model_site_beginW, (const wchar_t *name)) -#endif -ITT_STUBV(ITTAPI, void, model_site_beginA, (const char *name)) -ITT_STUBV(ITTAPI, void, model_site_beginAL, (const char *name, size_t siteNameLen)) -ITT_STUBV(ITTAPI, void, model_site_end, (__itt_model_site *site, __itt_model_site_instance *instance)) -ITT_STUBV(ITTAPI, void, model_site_end_2, (void)) -#define __itt_model_site_begin ITTNOTIFY_VOID(model_site_begin) -#define __itt_model_site_begin_ptr ITTNOTIFY_NAME(model_site_begin) -#if ITT_PLATFORM==ITT_PLATFORM_WIN -#define __itt_model_site_beginW ITTNOTIFY_VOID(model_site_beginW) -#define __itt_model_site_beginW_ptr ITTNOTIFY_NAME(model_site_beginW) -#endif -#define __itt_model_site_beginA ITTNOTIFY_VOID(model_site_beginA) -#define __itt_model_site_beginA_ptr ITTNOTIFY_NAME(model_site_beginA) -#define __itt_model_site_beginAL ITTNOTIFY_VOID(model_site_beginAL) -#define __itt_model_site_beginAL_ptr ITTNOTIFY_NAME(model_site_beginAL) -#define __itt_model_site_end ITTNOTIFY_VOID(model_site_end) -#define __itt_model_site_end_ptr ITTNOTIFY_NAME(model_site_end) -#define __itt_model_site_end_2 ITTNOTIFY_VOID(model_site_end_2) -#define __itt_model_site_end_2_ptr ITTNOTIFY_NAME(model_site_end_2) -#else /* INTEL_NO_ITTNOTIFY_API */ -#define __itt_model_site_begin(site, instance, name) -#define __itt_model_site_begin_ptr 0 -#if ITT_PLATFORM==ITT_PLATFORM_WIN -#define __itt_model_site_beginW(name) -#define __itt_model_site_beginW_ptr 0 -#endif -#define __itt_model_site_beginA(name) -#define 
__itt_model_site_beginA_ptr 0 -#define __itt_model_site_beginAL(name, siteNameLen) -#define __itt_model_site_beginAL_ptr 0 -#define __itt_model_site_end(site, instance) -#define __itt_model_site_end_ptr 0 -#define __itt_model_site_end_2() -#define __itt_model_site_end_2_ptr 0 -#endif /* INTEL_NO_ITTNOTIFY_API */ -#else /* INTEL_NO_MACRO_BODY */ -#define __itt_model_site_begin_ptr 0 -#if ITT_PLATFORM==ITT_PLATFORM_WIN -#define __itt_model_site_beginW_ptr 0 -#endif -#define __itt_model_site_beginA_ptr 0 -#define __itt_model_site_beginAL_ptr 0 -#define __itt_model_site_end_ptr 0 -#define __itt_model_site_end_2_ptr 0 -#endif /* INTEL_NO_MACRO_BODY */ -/** @endcond */ - -/** - * @brief ANNOTATE_TASK_BEGIN/ANNOTATE_TASK_END support - * - * task_begin/end model a potential task, which is contained within the most - * closely enclosing dynamic site. task_end exits the most recently started - * but unended task. The handle passed to end may be used to validate - * structure. It is unspecified if bad dynamic nesting is detected. If it - * is, it should be encoded in the resulting data collection. The collector - * should not fail due to construct nesting issues, nor attempt to directly - * indicate the problem. - */ -void ITTAPI __itt_model_task_begin(__itt_model_task *task, __itt_model_task_instance *instance, const char *name); -#if ITT_PLATFORM==ITT_PLATFORM_WIN -void ITTAPI __itt_model_task_beginW(const wchar_t *name); -void ITTAPI __itt_model_iteration_taskW(const wchar_t *name); -#endif -void ITTAPI __itt_model_task_beginA(const char *name); -void ITTAPI __itt_model_task_beginAL(const char *name, size_t taskNameLen); -void ITTAPI __itt_model_iteration_taskA(const char *name); -void ITTAPI __itt_model_iteration_taskAL(const char *name, size_t taskNameLen); -void ITTAPI __itt_model_task_end (__itt_model_task *task, __itt_model_task_instance *instance); -void ITTAPI __itt_model_task_end_2(void); - -/** @cond exclude_from_documentation */ -#ifndef INTEL_NO_MACRO_BODY -#ifndef INTEL_NO_ITTNOTIFY_API -ITT_STUBV(ITTAPI, void, model_task_begin, (__itt_model_task *task, __itt_model_task_instance *instance, const char *name)) -#if ITT_PLATFORM==ITT_PLATFORM_WIN -ITT_STUBV(ITTAPI, void, model_task_beginW, (const wchar_t *name)) -ITT_STUBV(ITTAPI, void, model_iteration_taskW, (const wchar_t *name)) -#endif -ITT_STUBV(ITTAPI, void, model_task_beginA, (const char *name)) -ITT_STUBV(ITTAPI, void, model_task_beginAL, (const char *name, size_t taskNameLen)) -ITT_STUBV(ITTAPI, void, model_iteration_taskA, (const char *name)) -ITT_STUBV(ITTAPI, void, model_iteration_taskAL, (const char *name, size_t taskNameLen)) -ITT_STUBV(ITTAPI, void, model_task_end, (__itt_model_task *task, __itt_model_task_instance *instance)) -ITT_STUBV(ITTAPI, void, model_task_end_2, (void)) -#define __itt_model_task_begin ITTNOTIFY_VOID(model_task_begin) -#define __itt_model_task_begin_ptr ITTNOTIFY_NAME(model_task_begin) -#if ITT_PLATFORM==ITT_PLATFORM_WIN -#define __itt_model_task_beginW ITTNOTIFY_VOID(model_task_beginW) -#define __itt_model_task_beginW_ptr ITTNOTIFY_NAME(model_task_beginW) -#define __itt_model_iteration_taskW ITTNOTIFY_VOID(model_iteration_taskW) -#define __itt_model_iteration_taskW_ptr ITTNOTIFY_NAME(model_iteration_taskW) -#endif -#define __itt_model_task_beginA ITTNOTIFY_VOID(model_task_beginA) -#define __itt_model_task_beginA_ptr ITTNOTIFY_NAME(model_task_beginA) -#define __itt_model_task_beginAL ITTNOTIFY_VOID(model_task_beginAL) -#define __itt_model_task_beginAL_ptr ITTNOTIFY_NAME(model_task_beginAL) -#define 
__itt_model_iteration_taskA ITTNOTIFY_VOID(model_iteration_taskA) -#define __itt_model_iteration_taskA_ptr ITTNOTIFY_NAME(model_iteration_taskA) -#define __itt_model_iteration_taskAL ITTNOTIFY_VOID(model_iteration_taskAL) -#define __itt_model_iteration_taskAL_ptr ITTNOTIFY_NAME(model_iteration_taskAL) -#define __itt_model_task_end ITTNOTIFY_VOID(model_task_end) -#define __itt_model_task_end_ptr ITTNOTIFY_NAME(model_task_end) -#define __itt_model_task_end_2 ITTNOTIFY_VOID(model_task_end_2) -#define __itt_model_task_end_2_ptr ITTNOTIFY_NAME(model_task_end_2) -#else /* INTEL_NO_ITTNOTIFY_API */ -#define __itt_model_task_begin(task, instance, name) -#define __itt_model_task_begin_ptr 0 -#if ITT_PLATFORM==ITT_PLATFORM_WIN -#define __itt_model_task_beginW(name) -#define __itt_model_task_beginW_ptr 0 -#endif -#define __itt_model_task_beginA(name) -#define __itt_model_task_beginA_ptr 0 -#define __itt_model_task_beginAL(name, siteNameLen) -#define __itt_model_task_beginAL_ptr 0 -#define __itt_model_iteration_taskA(name) -#define __itt_model_iteration_taskA_ptr 0 -#define __itt_model_iteration_taskAL(name, siteNameLen) -#define __itt_model_iteration_taskAL_ptr 0 -#define __itt_model_task_end(task, instance) -#define __itt_model_task_end_ptr 0 -#define __itt_model_task_end_2() -#define __itt_model_task_end_2_ptr 0 -#endif /* INTEL_NO_ITTNOTIFY_API */ -#else /* INTEL_NO_MACRO_BODY */ -#define __itt_model_task_begin_ptr 0 -#if ITT_PLATFORM==ITT_PLATFORM_WIN -#define __itt_model_task_beginW_ptr 0 -#endif -#define __itt_model_task_beginA_ptr 0 -#define __itt_model_task_beginAL_ptr 0 -#define __itt_model_iteration_taskA_ptr 0 -#define __itt_model_iteration_taskAL_ptr 0 -#define __itt_model_task_end_ptr 0 -#define __itt_model_task_end_2_ptr 0 -#endif /* INTEL_NO_MACRO_BODY */ -/** @endcond */ - -/** - * @brief ANNOTATE_LOCK_ACQUIRE/ANNOTATE_LOCK_RELEASE support - * - * lock_acquire/release model a potential lock for both lockset and - * performance modeling. Each unique address is modeled as a separate - * lock, with invalid addresses being valid lock IDs. Specifically: - * no storage is accessed by the API at the specified address - it is only - * used for lock identification. Lock acquires may be self-nested and are - * unlocked by a corresponding number of releases. - * (These closely correspond to __itt_sync_acquired/__itt_sync_releasing, - * but may not have identical semantics.) 
- */ -void ITTAPI __itt_model_lock_acquire(void *lock); -void ITTAPI __itt_model_lock_acquire_2(void *lock); -void ITTAPI __itt_model_lock_release(void *lock); -void ITTAPI __itt_model_lock_release_2(void *lock); - -/** @cond exclude_from_documentation */ -#ifndef INTEL_NO_MACRO_BODY -#ifndef INTEL_NO_ITTNOTIFY_API -ITT_STUBV(ITTAPI, void, model_lock_acquire, (void *lock)) -ITT_STUBV(ITTAPI, void, model_lock_acquire_2, (void *lock)) -ITT_STUBV(ITTAPI, void, model_lock_release, (void *lock)) -ITT_STUBV(ITTAPI, void, model_lock_release_2, (void *lock)) -#define __itt_model_lock_acquire ITTNOTIFY_VOID(model_lock_acquire) -#define __itt_model_lock_acquire_ptr ITTNOTIFY_NAME(model_lock_acquire) -#define __itt_model_lock_acquire_2 ITTNOTIFY_VOID(model_lock_acquire_2) -#define __itt_model_lock_acquire_2_ptr ITTNOTIFY_NAME(model_lock_acquire_2) -#define __itt_model_lock_release ITTNOTIFY_VOID(model_lock_release) -#define __itt_model_lock_release_ptr ITTNOTIFY_NAME(model_lock_release) -#define __itt_model_lock_release_2 ITTNOTIFY_VOID(model_lock_release_2) -#define __itt_model_lock_release_2_ptr ITTNOTIFY_NAME(model_lock_release_2) -#else /* INTEL_NO_ITTNOTIFY_API */ -#define __itt_model_lock_acquire(lock) -#define __itt_model_lock_acquire_ptr 0 -#define __itt_model_lock_acquire_2(lock) -#define __itt_model_lock_acquire_2_ptr 0 -#define __itt_model_lock_release(lock) -#define __itt_model_lock_release_ptr 0 -#define __itt_model_lock_release_2(lock) -#define __itt_model_lock_release_2_ptr 0 -#endif /* INTEL_NO_ITTNOTIFY_API */ -#else /* INTEL_NO_MACRO_BODY */ -#define __itt_model_lock_acquire_ptr 0 -#define __itt_model_lock_acquire_2_ptr 0 -#define __itt_model_lock_release_ptr 0 -#define __itt_model_lock_release_2_ptr 0 -#endif /* INTEL_NO_MACRO_BODY */ -/** @endcond */ - -/** - * @brief ANNOTATE_RECORD_ALLOCATION/ANNOTATE_RECORD_DEALLOCATION support - * - * record_allocation/deallocation describe user-defined memory allocator - * behavior, which may be required for correctness modeling to understand - * when storage is not expected to be actually reused across threads. 
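A minimal sketch of how a user-defined allocator might report logical allocation lifetimes through the calls declared just below; the bump allocator is invented for illustration and omits alignment handling:

    #include <ittnotify.h>
    #include <stddef.h>

    static char arena[1 << 16]; /* hypothetical bump-allocator backing store */
    static size_t arena_top;

    void *pool_alloc(size_t size) {
        void *p = &arena[arena_top];
        arena_top += size;
        __itt_model_record_allocation(p, size); /* block is logically live now */
        return p;
    }

    void pool_free(void *p) {
        __itt_model_record_deallocation(p); /* arena storage may be reused later */
    }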
- */ -void ITTAPI __itt_model_record_allocation (void *addr, size_t size); -void ITTAPI __itt_model_record_deallocation(void *addr); - -/** @cond exclude_from_documentation */ -#ifndef INTEL_NO_MACRO_BODY -#ifndef INTEL_NO_ITTNOTIFY_API -ITT_STUBV(ITTAPI, void, model_record_allocation, (void *addr, size_t size)) -ITT_STUBV(ITTAPI, void, model_record_deallocation, (void *addr)) -#define __itt_model_record_allocation ITTNOTIFY_VOID(model_record_allocation) -#define __itt_model_record_allocation_ptr ITTNOTIFY_NAME(model_record_allocation) -#define __itt_model_record_deallocation ITTNOTIFY_VOID(model_record_deallocation) -#define __itt_model_record_deallocation_ptr ITTNOTIFY_NAME(model_record_deallocation) -#else /* INTEL_NO_ITTNOTIFY_API */ -#define __itt_model_record_allocation(addr, size) -#define __itt_model_record_allocation_ptr 0 -#define __itt_model_record_deallocation(addr) -#define __itt_model_record_deallocation_ptr 0 -#endif /* INTEL_NO_ITTNOTIFY_API */ -#else /* INTEL_NO_MACRO_BODY */ -#define __itt_model_record_allocation_ptr 0 -#define __itt_model_record_deallocation_ptr 0 -#endif /* INTEL_NO_MACRO_BODY */ -/** @endcond */ - -/** - * @brief ANNOTATE_INDUCTION_USES support - * - * Note particular storage is inductive through the end of the current site - */ -void ITTAPI __itt_model_induction_uses(void* addr, size_t size); - -/** @cond exclude_from_documentation */ -#ifndef INTEL_NO_MACRO_BODY -#ifndef INTEL_NO_ITTNOTIFY_API -ITT_STUBV(ITTAPI, void, model_induction_uses, (void *addr, size_t size)) -#define __itt_model_induction_uses ITTNOTIFY_VOID(model_induction_uses) -#define __itt_model_induction_uses_ptr ITTNOTIFY_NAME(model_induction_uses) -#else /* INTEL_NO_ITTNOTIFY_API */ -#define __itt_model_induction_uses(addr, size) -#define __itt_model_induction_uses_ptr 0 -#endif /* INTEL_NO_ITTNOTIFY_API */ -#else /* INTEL_NO_MACRO_BODY */ -#define __itt_model_induction_uses_ptr 0 -#endif /* INTEL_NO_MACRO_BODY */ -/** @endcond */ - -/** - * @brief ANNOTATE_REDUCTION_USES support - * - * Note particular storage is used for reduction through the end - * of the current site - */ -void ITTAPI __itt_model_reduction_uses(void* addr, size_t size); - -/** @cond exclude_from_documentation */ -#ifndef INTEL_NO_MACRO_BODY -#ifndef INTEL_NO_ITTNOTIFY_API -ITT_STUBV(ITTAPI, void, model_reduction_uses, (void *addr, size_t size)) -#define __itt_model_reduction_uses ITTNOTIFY_VOID(model_reduction_uses) -#define __itt_model_reduction_uses_ptr ITTNOTIFY_NAME(model_reduction_uses) -#else /* INTEL_NO_ITTNOTIFY_API */ -#define __itt_model_reduction_uses(addr, size) -#define __itt_model_reduction_uses_ptr 0 -#endif /* INTEL_NO_ITTNOTIFY_API */ -#else /* INTEL_NO_MACRO_BODY */ -#define __itt_model_reduction_uses_ptr 0 -#endif /* INTEL_NO_MACRO_BODY */ -/** @endcond */ - -/** - * @brief ANNOTATE_OBSERVE_USES support - * - * Have correctness modeling record observations about uses of storage - * through the end of the current site - */ -void ITTAPI __itt_model_observe_uses(void* addr, size_t size); - -/** @cond exclude_from_documentation */ -#ifndef INTEL_NO_MACRO_BODY -#ifndef INTEL_NO_ITTNOTIFY_API -ITT_STUBV(ITTAPI, void, model_observe_uses, (void *addr, size_t size)) -#define __itt_model_observe_uses ITTNOTIFY_VOID(model_observe_uses) -#define __itt_model_observe_uses_ptr ITTNOTIFY_NAME(model_observe_uses) -#else /* INTEL_NO_ITTNOTIFY_API */ -#define __itt_model_observe_uses(addr, size) -#define __itt_model_observe_uses_ptr 0 -#endif /* INTEL_NO_ITTNOTIFY_API */ -#else /* INTEL_NO_MACRO_BODY */ -#define 
__itt_model_observe_uses_ptr 0 -#endif /* INTEL_NO_MACRO_BODY */ -/** @endcond */ - -/** - * @brief ANNOTATE_CLEAR_USES support - * - * Clear the special handling of a piece of storage related to induction, - * reduction or observe_uses - */ -void ITTAPI __itt_model_clear_uses(void* addr); - -/** @cond exclude_from_documentation */ -#ifndef INTEL_NO_MACRO_BODY -#ifndef INTEL_NO_ITTNOTIFY_API -ITT_STUBV(ITTAPI, void, model_clear_uses, (void *addr)) -#define __itt_model_clear_uses ITTNOTIFY_VOID(model_clear_uses) -#define __itt_model_clear_uses_ptr ITTNOTIFY_NAME(model_clear_uses) -#else /* INTEL_NO_ITTNOTIFY_API */ -#define __itt_model_clear_uses(addr) -#define __itt_model_clear_uses_ptr 0 -#endif /* INTEL_NO_ITTNOTIFY_API */ -#else /* INTEL_NO_MACRO_BODY */ -#define __itt_model_clear_uses_ptr 0 -#endif /* INTEL_NO_MACRO_BODY */ -/** @endcond */ - -/** - * @brief ANNOTATE_DISABLE_*_PUSH/ANNOTATE_DISABLE_*_POP support - * - * disable_push/disable_pop push and pop disabling based on a parameter. - * Disabling observations stops processing of memory references during - * correctness modeling, and all annotations that occur in the disabled - * region. This allows description of code that is expected to be handled - * specially during conversion to parallelism or that is not recognized - * by tools (e.g. some kinds of synchronization operations.) - * This mechanism causes all annotations in the disabled region, other - * than disable_push and disable_pop, to be ignored. (For example, this - * might validly be used to disable an entire parallel site and the contained - * tasks and locking in it for data collection purposes.) - * The disable for collection is a more expensive operation, but reduces - * collector overhead significantly. This applies to BOTH correctness data - * collection and performance data collection. For example, a site - * containing a task might only enable data collection for the first 10 - * iterations. Both performance and correctness data should reflect this, - * and the program should run as close to full speed as possible when - * collection is disabled. 
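A minimal sketch of the disable push/pop idiom described above, limiting collection to the first iterations; the loop body is invented for illustration:

    #include <ittnotify.h>

    static void iterate_once(void) { /* hypothetical loop body */ }

    void run_iterations(int n) {
        for (int i = 0; i < n; ++i) {
            if (i == 10) /* model only the first 10 iterations */
                __itt_model_disable_push(__itt_model_disable_collection);
            iterate_once();
        }
        if (n > 10)
            __itt_model_disable_pop(); /* re-enable collection afterwards */
    }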
- */ -void ITTAPI __itt_model_disable_push(__itt_model_disable x); -void ITTAPI __itt_model_disable_pop(void); -void ITTAPI __itt_model_aggregate_task(size_t x); - -/** @cond exclude_from_documentation */ -#ifndef INTEL_NO_MACRO_BODY -#ifndef INTEL_NO_ITTNOTIFY_API -ITT_STUBV(ITTAPI, void, model_disable_push, (__itt_model_disable x)) -ITT_STUBV(ITTAPI, void, model_disable_pop, (void)) -ITT_STUBV(ITTAPI, void, model_aggregate_task, (size_t x)) -#define __itt_model_disable_push ITTNOTIFY_VOID(model_disable_push) -#define __itt_model_disable_push_ptr ITTNOTIFY_NAME(model_disable_push) -#define __itt_model_disable_pop ITTNOTIFY_VOID(model_disable_pop) -#define __itt_model_disable_pop_ptr ITTNOTIFY_NAME(model_disable_pop) -#define __itt_model_aggregate_task ITTNOTIFY_VOID(model_aggregate_task) -#define __itt_model_aggregate_task_ptr ITTNOTIFY_NAME(model_aggregate_task) -#else /* INTEL_NO_ITTNOTIFY_API */ -#define __itt_model_disable_push(x) -#define __itt_model_disable_push_ptr 0 -#define __itt_model_disable_pop() -#define __itt_model_disable_pop_ptr 0 -#define __itt_model_aggregate_task(x) -#define __itt_model_aggregate_task_ptr 0 -#endif /* INTEL_NO_ITTNOTIFY_API */ -#else /* INTEL_NO_MACRO_BODY */ -#define __itt_model_disable_push_ptr 0 -#define __itt_model_disable_pop_ptr 0 -#define __itt_model_aggregate_task_ptr 0 -#endif /* INTEL_NO_MACRO_BODY */ -/** @endcond */ -/** @} model group */ - -/** - * @defgroup heap Heap - * @ingroup public - * Heap group - * @{ - */ - -typedef void* __itt_heap_function; - -/** - * @brief Create an identification for heap function - * @return non-zero identifier or NULL - */ -#if ITT_PLATFORM==ITT_PLATFORM_WIN -__itt_heap_function ITTAPI __itt_heap_function_createA(const char* name, const char* domain); -__itt_heap_function ITTAPI __itt_heap_function_createW(const wchar_t* name, const wchar_t* domain); -#if defined(UNICODE) || defined(_UNICODE) -# define __itt_heap_function_create __itt_heap_function_createW -# define __itt_heap_function_create_ptr __itt_heap_function_createW_ptr -#else -# define __itt_heap_function_create __itt_heap_function_createA -# define __itt_heap_function_create_ptr __itt_heap_function_createA_ptr -#endif /* UNICODE */ -#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -__itt_heap_function ITTAPI __itt_heap_function_create(const char* name, const char* domain); -#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ - -/** @cond exclude_from_documentation */ -#ifndef INTEL_NO_MACRO_BODY -#ifndef INTEL_NO_ITTNOTIFY_API -#if ITT_PLATFORM==ITT_PLATFORM_WIN -ITT_STUB(ITTAPI, __itt_heap_function, heap_function_createA, (const char* name, const char* domain)) -ITT_STUB(ITTAPI, __itt_heap_function, heap_function_createW, (const wchar_t* name, const wchar_t* domain)) -#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -ITT_STUB(ITTAPI, __itt_heap_function, heap_function_create, (const char* name, const char* domain)) -#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#if ITT_PLATFORM==ITT_PLATFORM_WIN -#define __itt_heap_function_createA ITTNOTIFY_DATA(heap_function_createA) -#define __itt_heap_function_createA_ptr ITTNOTIFY_NAME(heap_function_createA) -#define __itt_heap_function_createW ITTNOTIFY_DATA(heap_function_createW) -#define __itt_heap_function_createW_ptr ITTNOTIFY_NAME(heap_function_createW) -#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#define __itt_heap_function_create ITTNOTIFY_DATA(heap_function_create) -#define __itt_heap_function_create_ptr ITTNOTIFY_NAME(heap_function_create) -#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#else /* INTEL_NO_ITTNOTIFY_API */ 
-#if ITT_PLATFORM==ITT_PLATFORM_WIN -#define __itt_heap_function_createA(name, domain) (__itt_heap_function)0 -#define __itt_heap_function_createA_ptr 0 -#define __itt_heap_function_createW(name, domain) (__itt_heap_function)0 -#define __itt_heap_function_createW_ptr 0 -#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#define __itt_heap_function_create(name, domain) (__itt_heap_function)0 -#define __itt_heap_function_create_ptr 0 -#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#endif /* INTEL_NO_ITTNOTIFY_API */ -#else /* INTEL_NO_MACRO_BODY */ -#if ITT_PLATFORM==ITT_PLATFORM_WIN -#define __itt_heap_function_createA_ptr 0 -#define __itt_heap_function_createW_ptr 0 -#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#define __itt_heap_function_create_ptr 0 -#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#endif /* INTEL_NO_MACRO_BODY */ -/** @endcond */ - -/** - * @brief Record an allocation begin occurrence. - */ -void ITTAPI __itt_heap_allocate_begin(__itt_heap_function h, size_t size, int initialized); - -/** @cond exclude_from_documentation */ -#ifndef INTEL_NO_MACRO_BODY -#ifndef INTEL_NO_ITTNOTIFY_API -ITT_STUBV(ITTAPI, void, heap_allocate_begin, (__itt_heap_function h, size_t size, int initialized)) -#define __itt_heap_allocate_begin ITTNOTIFY_VOID(heap_allocate_begin) -#define __itt_heap_allocate_begin_ptr ITTNOTIFY_NAME(heap_allocate_begin) -#else /* INTEL_NO_ITTNOTIFY_API */ -#define __itt_heap_allocate_begin(h, size, initialized) -#define __itt_heap_allocate_begin_ptr 0 -#endif /* INTEL_NO_ITTNOTIFY_API */ -#else /* INTEL_NO_MACRO_BODY */ -#define __itt_heap_allocate_begin_ptr 0 -#endif /* INTEL_NO_MACRO_BODY */ -/** @endcond */ - -/** - * @brief Record an allocation end occurrence. - */ -void ITTAPI __itt_heap_allocate_end(__itt_heap_function h, void** addr, size_t size, int initialized); - -/** @cond exclude_from_documentation */ -#ifndef INTEL_NO_MACRO_BODY -#ifndef INTEL_NO_ITTNOTIFY_API -ITT_STUBV(ITTAPI, void, heap_allocate_end, (__itt_heap_function h, void** addr, size_t size, int initialized)) -#define __itt_heap_allocate_end ITTNOTIFY_VOID(heap_allocate_end) -#define __itt_heap_allocate_end_ptr ITTNOTIFY_NAME(heap_allocate_end) -#else /* INTEL_NO_ITTNOTIFY_API */ -#define __itt_heap_allocate_end(h, addr, size, initialized) -#define __itt_heap_allocate_end_ptr 0 -#endif /* INTEL_NO_ITTNOTIFY_API */ -#else /* INTEL_NO_MACRO_BODY */ -#define __itt_heap_allocate_end_ptr 0 -#endif /* INTEL_NO_MACRO_BODY */ -/** @endcond */ - -/** - * @brief Record a free begin occurrence. - */ -void ITTAPI __itt_heap_free_begin(__itt_heap_function h, void* addr); - -/** @cond exclude_from_documentation */ -#ifndef INTEL_NO_MACRO_BODY -#ifndef INTEL_NO_ITTNOTIFY_API -ITT_STUBV(ITTAPI, void, heap_free_begin, (__itt_heap_function h, void* addr)) -#define __itt_heap_free_begin ITTNOTIFY_VOID(heap_free_begin) -#define __itt_heap_free_begin_ptr ITTNOTIFY_NAME(heap_free_begin) -#else /* INTEL_NO_ITTNOTIFY_API */ -#define __itt_heap_free_begin(h, addr) -#define __itt_heap_free_begin_ptr 0 -#endif /* INTEL_NO_ITTNOTIFY_API */ -#else /* INTEL_NO_MACRO_BODY */ -#define __itt_heap_free_begin_ptr 0 -#endif /* INTEL_NO_MACRO_BODY */ -/** @endcond */ - -/** - * @brief Record a free end occurrence. 
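A minimal sketch of wrapping an allocator with the heap begin/end pairs above (__itt_heap_free_end is declared just below); the wrapper and domain names are invented for illustration:

    #include <ittnotify.h>
    #include <stdlib.h>

    static __itt_heap_function alloc_fn, free_fn;

    void heap_tracing_init(void) {
        alloc_fn = __itt_heap_function_create("my_malloc", "mylib");
        free_fn  = __itt_heap_function_create("my_free",  "mylib");
    }

    void *my_malloc(size_t size) {
        void *p;
        __itt_heap_allocate_begin(alloc_fn, size, 0 /* not initialized */);
        p = malloc(size);
        __itt_heap_allocate_end(alloc_fn, &p, size, 0);
        return p;
    }

    void my_free(void *p) {
        __itt_heap_free_begin(free_fn, p);
        free(p);
        __itt_heap_free_end(free_fn, p);
    }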
- */ -void ITTAPI __itt_heap_free_end(__itt_heap_function h, void* addr); - -/** @cond exclude_from_documentation */ -#ifndef INTEL_NO_MACRO_BODY -#ifndef INTEL_NO_ITTNOTIFY_API -ITT_STUBV(ITTAPI, void, heap_free_end, (__itt_heap_function h, void* addr)) -#define __itt_heap_free_end ITTNOTIFY_VOID(heap_free_end) -#define __itt_heap_free_end_ptr ITTNOTIFY_NAME(heap_free_end) -#else /* INTEL_NO_ITTNOTIFY_API */ -#define __itt_heap_free_end(h, addr) -#define __itt_heap_free_end_ptr 0 -#endif /* INTEL_NO_ITTNOTIFY_API */ -#else /* INTEL_NO_MACRO_BODY */ -#define __itt_heap_free_end_ptr 0 -#endif /* INTEL_NO_MACRO_BODY */ -/** @endcond */ - -/** - * @brief Record a reallocation begin occurrence. - */ -void ITTAPI __itt_heap_reallocate_begin(__itt_heap_function h, void* addr, size_t new_size, int initialized); - -/** @cond exclude_from_documentation */ -#ifndef INTEL_NO_MACRO_BODY -#ifndef INTEL_NO_ITTNOTIFY_API -ITT_STUBV(ITTAPI, void, heap_reallocate_begin, (__itt_heap_function h, void* addr, size_t new_size, int initialized)) -#define __itt_heap_reallocate_begin ITTNOTIFY_VOID(heap_reallocate_begin) -#define __itt_heap_reallocate_begin_ptr ITTNOTIFY_NAME(heap_reallocate_begin) -#else /* INTEL_NO_ITTNOTIFY_API */ -#define __itt_heap_reallocate_begin(h, addr, new_size, initialized) -#define __itt_heap_reallocate_begin_ptr 0 -#endif /* INTEL_NO_ITTNOTIFY_API */ -#else /* INTEL_NO_MACRO_BODY */ -#define __itt_heap_reallocate_begin_ptr 0 -#endif /* INTEL_NO_MACRO_BODY */ -/** @endcond */ - -/** - * @brief Record a reallocation end occurrence. - */ -void ITTAPI __itt_heap_reallocate_end(__itt_heap_function h, void* addr, void** new_addr, size_t new_size, int initialized); - -/** @cond exclude_from_documentation */ -#ifndef INTEL_NO_MACRO_BODY -#ifndef INTEL_NO_ITTNOTIFY_API -ITT_STUBV(ITTAPI, void, heap_reallocate_end, (__itt_heap_function h, void* addr, void** new_addr, size_t new_size, int initialized)) -#define __itt_heap_reallocate_end ITTNOTIFY_VOID(heap_reallocate_end) -#define __itt_heap_reallocate_end_ptr ITTNOTIFY_NAME(heap_reallocate_end) -#else /* INTEL_NO_ITTNOTIFY_API */ -#define __itt_heap_reallocate_end(h, addr, new_addr, new_size, initialized) -#define __itt_heap_reallocate_end_ptr 0 -#endif /* INTEL_NO_ITTNOTIFY_API */ -#else /* INTEL_NO_MACRO_BODY */ -#define __itt_heap_reallocate_end_ptr 0 -#endif /* INTEL_NO_MACRO_BODY */ -/** @endcond */ - -/** @brief internal access begin */ -void ITTAPI __itt_heap_internal_access_begin(void); - -/** @cond exclude_from_documentation */ -#ifndef INTEL_NO_MACRO_BODY -#ifndef INTEL_NO_ITTNOTIFY_API -ITT_STUBV(ITTAPI, void, heap_internal_access_begin, (void)) -#define __itt_heap_internal_access_begin ITTNOTIFY_VOID(heap_internal_access_begin) -#define __itt_heap_internal_access_begin_ptr ITTNOTIFY_NAME(heap_internal_access_begin) -#else /* INTEL_NO_ITTNOTIFY_API */ -#define __itt_heap_internal_access_begin() -#define __itt_heap_internal_access_begin_ptr 0 -#endif /* INTEL_NO_ITTNOTIFY_API */ -#else /* INTEL_NO_MACRO_BODY */ -#define __itt_heap_internal_access_begin_ptr 0 -#endif /* INTEL_NO_MACRO_BODY */ -/** @endcond */ - -/** @brief internal access end */ -void ITTAPI __itt_heap_internal_access_end(void); - -/** @cond exclude_from_documentation */ -#ifndef INTEL_NO_MACRO_BODY -#ifndef INTEL_NO_ITTNOTIFY_API -ITT_STUBV(ITTAPI, void, heap_internal_access_end, (void)) -#define __itt_heap_internal_access_end ITTNOTIFY_VOID(heap_internal_access_end) -#define __itt_heap_internal_access_end_ptr ITTNOTIFY_NAME(heap_internal_access_end) -#else /* 
INTEL_NO_ITTNOTIFY_API */ -#define __itt_heap_internal_access_end() -#define __itt_heap_internal_access_end_ptr 0 -#endif /* INTEL_NO_ITTNOTIFY_API */ -#else /* INTEL_NO_MACRO_BODY */ -#define __itt_heap_internal_access_end_ptr 0 -#endif /* INTEL_NO_MACRO_BODY */ -/** @endcond */ - -/** @brief record memory growth begin */ -void ITTAPI __itt_heap_record_memory_growth_begin(void); - -/** @cond exclude_from_documentation */ -#ifndef INTEL_NO_MACRO_BODY -#ifndef INTEL_NO_ITTNOTIFY_API -ITT_STUBV(ITTAPI, void, heap_record_memory_growth_begin, (void)) -#define __itt_heap_record_memory_growth_begin ITTNOTIFY_VOID(heap_record_memory_growth_begin) -#define __itt_heap_record_memory_growth_begin_ptr ITTNOTIFY_NAME(heap_record_memory_growth_begin) -#else /* INTEL_NO_ITTNOTIFY_API */ -#define __itt_heap_record_memory_growth_begin() -#define __itt_heap_record_memory_growth_begin_ptr 0 -#endif /* INTEL_NO_ITTNOTIFY_API */ -#else /* INTEL_NO_MACRO_BODY */ -#define __itt_heap_record_memory_growth_begin_ptr 0 -#endif /* INTEL_NO_MACRO_BODY */ -/** @endcond */ - -/** @brief record memory growth end */ -void ITTAPI __itt_heap_record_memory_growth_end(void); - -/** @cond exclude_from_documentation */ -#ifndef INTEL_NO_MACRO_BODY -#ifndef INTEL_NO_ITTNOTIFY_API -ITT_STUBV(ITTAPI, void, heap_record_memory_growth_end, (void)) -#define __itt_heap_record_memory_growth_end ITTNOTIFY_VOID(heap_record_memory_growth_end) -#define __itt_heap_record_memory_growth_end_ptr ITTNOTIFY_NAME(heap_record_memory_growth_end) -#else /* INTEL_NO_ITTNOTIFY_API */ -#define __itt_heap_record_memory_growth_end() -#define __itt_heap_record_memory_growth_end_ptr 0 -#endif /* INTEL_NO_ITTNOTIFY_API */ -#else /* INTEL_NO_MACRO_BODY */ -#define __itt_heap_record_memory_growth_end_ptr 0 -#endif /* INTEL_NO_MACRO_BODY */ -/** @endcond */ - -/** - * @brief Specify the type of heap detection/reporting to modify. - */ -/** - * @hideinitializer - * @brief Report on memory leaks. - */ -#define __itt_heap_leaks 0x00000001 - -/** - * @hideinitializer - * @brief Report on memory growth. 
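
A minimal sketch of how the heap hooks above can instrument a custom allocator, assuming a collector is attached; the my_malloc/MyAllocator names and the wrapper functions are illustrative, not part of the API:

    #include <stdlib.h>
    #include "ittnotify.h"

    static __itt_heap_function g_malloc_fn; /* one handle per allocation routine */

    void heap_tracing_init(void)
    {
        g_malloc_fn = __itt_heap_function_create("my_malloc", "MyAllocator");
    }

    void* my_malloc(size_t size)
    {
        void* p;
        __itt_heap_allocate_begin(g_malloc_fn, size, 0 /* memory not initialized */);
        p = malloc(size);
        __itt_heap_allocate_end(g_malloc_fn, &p, size, 0);
        return p;
    }

    void my_free(void* p)
    {
        __itt_heap_free_begin(g_malloc_fn, p);
        free(p);
        __itt_heap_free_end(g_malloc_fn, p);
    }
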
 - */
-#define __itt_heap_growth 0x00000002
-
-
-/** @brief Reset heap detection for the categories selected by reset_mask */
-void ITTAPI __itt_heap_reset_detection(unsigned int reset_mask);
-
-/** @cond exclude_from_documentation */
-#ifndef INTEL_NO_MACRO_BODY
-#ifndef INTEL_NO_ITTNOTIFY_API
-ITT_STUBV(ITTAPI, void, heap_reset_detection, (unsigned int reset_mask))
-#define __itt_heap_reset_detection ITTNOTIFY_VOID(heap_reset_detection)
-#define __itt_heap_reset_detection_ptr ITTNOTIFY_NAME(heap_reset_detection)
-#else /* INTEL_NO_ITTNOTIFY_API */
-#define __itt_heap_reset_detection(reset_mask)
-#define __itt_heap_reset_detection_ptr 0
-#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
-#define __itt_heap_reset_detection_ptr 0
-#endif /* INTEL_NO_MACRO_BODY */
-/** @endcond */
-
-/** @brief Report on the heap categories selected by record_mask */
-void ITTAPI __itt_heap_record(unsigned int record_mask);
-
-/** @cond exclude_from_documentation */
-#ifndef INTEL_NO_MACRO_BODY
-#ifndef INTEL_NO_ITTNOTIFY_API
-ITT_STUBV(ITTAPI, void, heap_record, (unsigned int record_mask))
-#define __itt_heap_record ITTNOTIFY_VOID(heap_record)
-#define __itt_heap_record_ptr ITTNOTIFY_NAME(heap_record)
-#else /* INTEL_NO_ITTNOTIFY_API */
-#define __itt_heap_record(record_mask)
-#define __itt_heap_record_ptr 0
-#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
-#define __itt_heap_record_ptr 0
-#endif /* INTEL_NO_MACRO_BODY */
-/** @endcond */
-
-/** @} heap group */
-/** @endcond */
-/* ========================================================================== */
-
-/**
- * @defgroup domains Domains
- * @ingroup public
- * Domains group
- * @{
- */
-
-/** @cond exclude_from_documentation */
-#pragma pack(push, 8)
-
-typedef struct ___itt_domain
-{
- volatile int flags; /*!< Zero if disabled, non-zero if enabled. The meaning of different non-zero values is reserved to the runtime */
- const char* nameA; /*!< Copy of original name in ASCII. */
-#if defined(UNICODE) || defined(_UNICODE)
- const wchar_t* nameW; /*!< Copy of original name in UNICODE. */
-#else /* UNICODE || _UNICODE */
- void* nameW;
-#endif /* UNICODE || _UNICODE */
- int extra1; /*!< Reserved to the runtime */
- void* extra2; /*!< Reserved to the runtime */
- struct ___itt_domain* next;
-} __itt_domain;
-
-#pragma pack(pop)
-/** @endcond */
-
-/**
- * @ingroup domains
- * @brief Create a domain.
- * Create a domain using some domain name; the URI naming style is recommended.
- * Because the set of domains is expected to be static over the application's
- * execution time, there is no mechanism to destroy a domain.
- * Any domain can be accessed by any thread in the process, regardless of
- * which thread created the domain. This call is thread-safe.
- * @param[in] name The name of the domain
- */
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-__itt_domain* ITTAPI __itt_domain_createA(const char *name);
-__itt_domain* ITTAPI __itt_domain_createW(const wchar_t *name);
-#if defined(UNICODE) || defined(_UNICODE)
-# define __itt_domain_create __itt_domain_createW
-# define __itt_domain_create_ptr __itt_domain_createW_ptr
-#else /* UNICODE */
-# define __itt_domain_create __itt_domain_createA
-# define __itt_domain_create_ptr __itt_domain_createA_ptr
-#endif /* UNICODE */
-#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-__itt_domain* ITTAPI __itt_domain_create(const char *name);
-#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-
-/** @cond exclude_from_documentation */
-#ifndef INTEL_NO_MACRO_BODY
-#ifndef INTEL_NO_ITTNOTIFY_API
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-ITT_STUB(ITTAPI, __itt_domain*, domain_createA, (const char *name))
-ITT_STUB(ITTAPI, __itt_domain*, domain_createW, (const wchar_t *name))
-#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-ITT_STUB(ITTAPI, __itt_domain*, domain_create, (const char *name))
-#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-#define __itt_domain_createA ITTNOTIFY_DATA(domain_createA)
-#define __itt_domain_createA_ptr ITTNOTIFY_NAME(domain_createA)
-#define __itt_domain_createW ITTNOTIFY_DATA(domain_createW)
-#define __itt_domain_createW_ptr ITTNOTIFY_NAME(domain_createW)
-#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#define __itt_domain_create ITTNOTIFY_DATA(domain_create)
-#define __itt_domain_create_ptr ITTNOTIFY_NAME(domain_create)
-#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#else /* INTEL_NO_ITTNOTIFY_API */
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-#define __itt_domain_createA(name) (__itt_domain*)0
-#define __itt_domain_createA_ptr 0
-#define __itt_domain_createW(name) (__itt_domain*)0
-#define __itt_domain_createW_ptr 0
-#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#define __itt_domain_create(name) (__itt_domain*)0
-#define __itt_domain_create_ptr 0
-#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-#define __itt_domain_createA_ptr 0
-#define __itt_domain_createW_ptr 0
-#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#define __itt_domain_create_ptr 0
-#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#endif /* INTEL_NO_MACRO_BODY */
-/** @endcond */
-/** @} domains group */
-
-/**
- * @defgroup ids IDs
- * @ingroup public
- * IDs group
- * @{
- */
-
-/** @cond exclude_from_documentation */
-#pragma pack(push, 8)
-
-typedef struct ___itt_id
-{
- unsigned long long d1, d2, d3;
-} __itt_id;
-
-#pragma pack(pop)
-/** @endcond */
-
-static const __itt_id __itt_null = { 0, 0, 0 };
-
-/**
- * @ingroup ids
- * @brief Create an ID without domain control.
- * This is a convenience function to initialize an __itt_id structure. The function
- * does not affect the collector runtime in any way. After you make the ID with this
- * function, you still must create it with the __itt_id_create function before using the ID
- * to identify a named entity.
- * @param[in] addr The address of the object; high QWORD of the ID value.
- * @param[in] extra The extra data to uniquely identify the object; low QWORD of the ID value.
- */
-
-ITT_INLINE __itt_id ITTAPI __itt_id_make(void* addr, unsigned long long extra) ITT_INLINE_ATTRIBUTE;
-ITT_INLINE __itt_id ITTAPI __itt_id_make(void* addr, unsigned long long extra)
-{
- __itt_id id = __itt_null;
- id.d1 = (unsigned long long)((uintptr_t)addr);
- id.d2 = (unsigned long long)extra;
- id.d3 = (unsigned long long)0; /* Reserved. Must be zero */
- return id;
-}
-
-/**
- * @ingroup ids
- * @brief Create an instance of identifier.
- * This establishes the beginning of the lifetime of an instance of
- * the given ID in the trace. Once this lifetime starts, the ID
- * can be used to tag named entity instances in calls such as
- * __itt_task_begin, and to specify relationships among
- * identified named entity instances, using the \ref relations APIs.
- * Instance IDs are not domain specific!
- * @param[in] domain The domain controlling the execution of this call.
- * @param[in] id The ID to create.
- */
-void ITTAPI __itt_id_create(const __itt_domain *domain, __itt_id id);
-
-/** @cond exclude_from_documentation */
-#ifndef INTEL_NO_MACRO_BODY
-#ifndef INTEL_NO_ITTNOTIFY_API
-ITT_STUBV(ITTAPI, void, id_create, (const __itt_domain *domain, __itt_id id))
-#define __itt_id_create(d,x) ITTNOTIFY_VOID_D1(id_create,d,x)
-#define __itt_id_create_ptr ITTNOTIFY_NAME(id_create)
-#else /* INTEL_NO_ITTNOTIFY_API */
-#define __itt_id_create(domain,id)
-#define __itt_id_create_ptr 0
-#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
-#define __itt_id_create_ptr 0
-#endif /* INTEL_NO_MACRO_BODY */
-/** @endcond */
-
-/**
- * @ingroup ids
- * @brief Destroy an instance of identifier.
- * This ends the lifetime of the current instance of the given ID value in the trace.
- * Any relationships that are established after this lifetime ends are invalid.
- * This call must be performed before the given ID value can be reused for a different
- * named entity instance.
- * @param[in] domain The domain controlling the execution of this call.
- * @param[in] id The ID to destroy.
- */
-void ITTAPI __itt_id_destroy(const __itt_domain *domain, __itt_id id);
-
-/** @cond exclude_from_documentation */
-#ifndef INTEL_NO_MACRO_BODY
-#ifndef INTEL_NO_ITTNOTIFY_API
-ITT_STUBV(ITTAPI, void, id_destroy, (const __itt_domain *domain, __itt_id id))
-#define __itt_id_destroy(d,x) ITTNOTIFY_VOID_D1(id_destroy,d,x)
-#define __itt_id_destroy_ptr ITTNOTIFY_NAME(id_destroy)
-#else /* INTEL_NO_ITTNOTIFY_API */
-#define __itt_id_destroy(domain,id)
-#define __itt_id_destroy_ptr 0
-#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
-#define __itt_id_destroy_ptr 0
-#endif /* INTEL_NO_MACRO_BODY */
-/** @endcond */
-/** @} ids group */
-
-/**
- * @defgroup handles String Handles
- * @ingroup public
- * String Handles group
- * @{
- */
-
-/** @cond exclude_from_documentation */
-#pragma pack(push, 8)
-
-typedef struct ___itt_string_handle
-{
- const char* strA; /*!< Copy of original string in ASCII. */
-#if defined(UNICODE) || defined(_UNICODE)
- const wchar_t* strW; /*!< Copy of original string in UNICODE. */
-#else /* UNICODE || _UNICODE */
- void* strW;
-#endif /* UNICODE || _UNICODE */
- int extra1; /*!< Reserved. Must be zero */
- void* extra2; /*!< Reserved. Must be zero */
- struct ___itt_string_handle* next;
-} __itt_string_handle;
-
-#pragma pack(pop)
-/** @endcond */
-
-/**
- * @ingroup handles
- * @brief Create a string handle.
- * Create and return a handle value that can be associated with a string.
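
For illustration, the full ID lifecycle under the contract above; the domain name and traced object are hypothetical placeholders. __itt_id_make only fills in the structure locally, while __itt_id_create and __itt_id_destroy bracket the instance's lifetime in the trace:

    #include "ittnotify.h"

    void tag_object(void* obj /* hypothetical traced object */)
    {
        __itt_domain* d = __itt_domain_create("com.example.app");
        __itt_id id = __itt_id_make(obj, 0); /* local initialization only */
        __itt_id_create(d, id);              /* lifetime starts in the trace */
        /* ... use `id` to tag tasks or establish relations ... */
        __itt_id_destroy(d, id);             /* lifetime ends; value may be reused */
    }
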
- * Consecutive calls to __itt_string_handle_create with the same name - * return the same value. Because the set of string handles is expected to remain - * static during the application's execution time, there is no mechanism to destroy a string handle. - * Any string handle can be accessed by any thread in the process, regardless of which thread created - * the string handle. This call is thread-safe. - * @param[in] name The input string - */ -#if ITT_PLATFORM==ITT_PLATFORM_WIN -__itt_string_handle* ITTAPI __itt_string_handle_createA(const char *name); -__itt_string_handle* ITTAPI __itt_string_handle_createW(const wchar_t *name); -#if defined(UNICODE) || defined(_UNICODE) -# define __itt_string_handle_create __itt_string_handle_createW -# define __itt_string_handle_create_ptr __itt_string_handle_createW_ptr -#else /* UNICODE */ -# define __itt_string_handle_create __itt_string_handle_createA -# define __itt_string_handle_create_ptr __itt_string_handle_createA_ptr -#endif /* UNICODE */ -#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -__itt_string_handle* ITTAPI __itt_string_handle_create(const char *name); -#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ - -/** @cond exclude_from_documentation */ -#ifndef INTEL_NO_MACRO_BODY -#ifndef INTEL_NO_ITTNOTIFY_API -#if ITT_PLATFORM==ITT_PLATFORM_WIN -ITT_STUB(ITTAPI, __itt_string_handle*, string_handle_createA, (const char *name)) -ITT_STUB(ITTAPI, __itt_string_handle*, string_handle_createW, (const wchar_t *name)) -#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -ITT_STUB(ITTAPI, __itt_string_handle*, string_handle_create, (const char *name)) -#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#if ITT_PLATFORM==ITT_PLATFORM_WIN -#define __itt_string_handle_createA ITTNOTIFY_DATA(string_handle_createA) -#define __itt_string_handle_createA_ptr ITTNOTIFY_NAME(string_handle_createA) -#define __itt_string_handle_createW ITTNOTIFY_DATA(string_handle_createW) -#define __itt_string_handle_createW_ptr ITTNOTIFY_NAME(string_handle_createW) -#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#define __itt_string_handle_create ITTNOTIFY_DATA(string_handle_create) -#define __itt_string_handle_create_ptr ITTNOTIFY_NAME(string_handle_create) -#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#else /* INTEL_NO_ITTNOTIFY_API */ -#if ITT_PLATFORM==ITT_PLATFORM_WIN -#define __itt_string_handle_createA(name) (__itt_string_handle*)0 -#define __itt_string_handle_createA_ptr 0 -#define __itt_string_handle_createW(name) (__itt_string_handle*)0 -#define __itt_string_handle_createW_ptr 0 -#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#define __itt_string_handle_create(name) (__itt_string_handle*)0 -#define __itt_string_handle_create_ptr 0 -#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#endif /* INTEL_NO_ITTNOTIFY_API */ -#else /* INTEL_NO_MACRO_BODY */ -#if ITT_PLATFORM==ITT_PLATFORM_WIN -#define __itt_string_handle_createA_ptr 0 -#define __itt_string_handle_createW_ptr 0 -#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#define __itt_string_handle_create_ptr 0 -#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#endif /* INTEL_NO_MACRO_BODY */ -/** @endcond */ -/** @} handles group */ - -/** @cond exclude_from_documentation */ -typedef unsigned long long __itt_timestamp; -/** @endcond */ - -#define __itt_timestamp_none ((__itt_timestamp)-1LL) - -/** @cond exclude_from_gpa_documentation */ - -/** - * @ingroup timestamps - * @brief Return timestamp corresponding to the current moment. 
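
Since consecutive calls with the same name return the same value, a common pattern is to create handles once and cache the pointers; a small sketch (the name is illustrative):

    void cache_handles(void)
    {
        /* Repeated calls with the same string return the same handle. */
        __itt_string_handle* h1 = __itt_string_handle_create("compute_phase");
        __itt_string_handle* h2 = __itt_string_handle_create("compute_phase");
        /* h1 == h2 holds here, so caching the pointer in a global is safe. */
        (void)h1; (void)h2;
    }
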
- * This returns the timestamp in the format that is the most relevant for the current
- * host or platform (RDTSC, QPC, and others). You can use the "<" operator to
- * compare __itt_timestamp values.
- */
-__itt_timestamp ITTAPI __itt_get_timestamp(void);
-
-/** @cond exclude_from_documentation */
-#ifndef INTEL_NO_MACRO_BODY
-#ifndef INTEL_NO_ITTNOTIFY_API
-ITT_STUB(ITTAPI, __itt_timestamp, get_timestamp, (void))
-#define __itt_get_timestamp ITTNOTIFY_DATA(get_timestamp)
-#define __itt_get_timestamp_ptr ITTNOTIFY_NAME(get_timestamp)
-#else /* INTEL_NO_ITTNOTIFY_API */
-#define __itt_get_timestamp() ((__itt_timestamp)0)
-#define __itt_get_timestamp_ptr 0
-#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
-#define __itt_get_timestamp_ptr 0
-#endif /* INTEL_NO_MACRO_BODY */
-/** @endcond */
-/** @} timestamps */
-/** @endcond */
-
-/** @cond exclude_from_gpa_documentation */
-
-/**
- * @defgroup regions Regions
- * @ingroup public
- * Regions group
- * @{
- */
-/**
- * @ingroup regions
- * @brief Begin a region instance.
- * Successive calls to __itt_region_begin with the same ID are ignored
- * until a call to __itt_region_end with the same ID.
- * @param[in] domain The domain for this region instance
- * @param[in] id The instance ID for this region instance. Must not be __itt_null
- * @param[in] parentid The instance ID for the parent of this region instance, or __itt_null
- * @param[in] name The name of this region
- */
-void ITTAPI __itt_region_begin(const __itt_domain *domain, __itt_id id, __itt_id parentid, __itt_string_handle *name);
-
-/**
- * @ingroup regions
- * @brief End a region instance.
- * The first call to __itt_region_end with a given ID ends the
- * region. Successive calls with the same ID are ignored, as are
- * calls that do not have a matching __itt_region_begin call.
- * @param[in] domain The domain for this region instance
- * @param[in] id The instance ID for this region instance
- */
-void ITTAPI __itt_region_end(const __itt_domain *domain, __itt_id id);
-
-/** @cond exclude_from_documentation */
-#ifndef INTEL_NO_MACRO_BODY
-#ifndef INTEL_NO_ITTNOTIFY_API
-ITT_STUBV(ITTAPI, void, region_begin, (const __itt_domain *domain, __itt_id id, __itt_id parentid, __itt_string_handle *name))
-ITT_STUBV(ITTAPI, void, region_end, (const __itt_domain *domain, __itt_id id))
-#define __itt_region_begin(d,x,y,z) ITTNOTIFY_VOID_D3(region_begin,d,x,y,z)
-#define __itt_region_begin_ptr ITTNOTIFY_NAME(region_begin)
-#define __itt_region_end(d,x) ITTNOTIFY_VOID_D1(region_end,d,x)
-#define __itt_region_end_ptr ITTNOTIFY_NAME(region_end)
-#else /* INTEL_NO_ITTNOTIFY_API */
-#define __itt_region_begin(d,x,y,z)
-#define __itt_region_begin_ptr 0
-#define __itt_region_end(d,x)
-#define __itt_region_end_ptr 0
-#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
-#define __itt_region_begin_ptr 0
-#define __itt_region_end_ptr 0
-#endif /* INTEL_NO_MACRO_BODY */
-/** @endcond */
-/** @} regions group */
-
-/**
- * @defgroup frames Frames
- * @ingroup public
- * Frames are similar to regions, but are intended to be easier to use and to implement.
- * In particular:
- * - Frames always represent periods of elapsed time
- * - By default, frames have no nesting relationships
- * @{
- */
-
-/**
- * @ingroup frames
- * @brief Begin a frame instance.
- * Successive calls to __itt_frame_begin with the
- * same ID are ignored until a call to __itt_frame_end with the same ID.
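
A short sketch of the region bracketing described above; the domain, region name, and request object are illustrative:

    void handle_request(void* request)
    {
        __itt_domain* d = __itt_domain_create("com.example.server");
        __itt_string_handle* name = __itt_string_handle_create("handle_request");
        __itt_id rid = __itt_id_make(request, 0);

        __itt_region_begin(d, rid, __itt_null, name);
        /* ... process the request ... */
        __itt_region_end(d, rid);
    }
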
- * @param[in] domain The domain for this frame instance - * @param[in] id The instance ID for this frame instance or NULL - */ -void ITTAPI __itt_frame_begin_v3(const __itt_domain *domain, __itt_id *id); - -/** - * @ingroup frames - * @brief End a frame instance. - * The first call to __itt_frame_end with a given ID - * ends the frame. Successive calls with the same ID are ignored, as are - * calls that do not have a matching __itt_frame_begin call. - * @param[in] domain The domain for this frame instance - * @param[in] id The instance ID for this frame instance or NULL for current - */ -void ITTAPI __itt_frame_end_v3(const __itt_domain *domain, __itt_id *id); - -/** - * @ingroup frames - * @brief Submits a frame instance. - * Successive calls to __itt_frame_begin or __itt_frame_submit with the - * same ID are ignored until a call to __itt_frame_end or __itt_frame_submit - * with the same ID. - * Passing special __itt_timestamp_none value as "end" argument means - * take the current timestamp as the end timestamp. - * @param[in] domain The domain for this frame instance - * @param[in] id The instance ID for this frame instance or NULL - * @param[in] begin Timestamp of the beginning of the frame - * @param[in] end Timestamp of the end of the frame - */ -void ITTAPI __itt_frame_submit_v3(const __itt_domain *domain, __itt_id *id, - __itt_timestamp begin, __itt_timestamp end); - -/** @cond exclude_from_documentation */ -#ifndef INTEL_NO_MACRO_BODY -#ifndef INTEL_NO_ITTNOTIFY_API -ITT_STUBV(ITTAPI, void, frame_begin_v3, (const __itt_domain *domain, __itt_id *id)) -ITT_STUBV(ITTAPI, void, frame_end_v3, (const __itt_domain *domain, __itt_id *id)) -ITT_STUBV(ITTAPI, void, frame_submit_v3, (const __itt_domain *domain, __itt_id *id, __itt_timestamp begin, __itt_timestamp end)) -#define __itt_frame_begin_v3(d,x) ITTNOTIFY_VOID_D1(frame_begin_v3,d,x) -#define __itt_frame_begin_v3_ptr ITTNOTIFY_NAME(frame_begin_v3) -#define __itt_frame_end_v3(d,x) ITTNOTIFY_VOID_D1(frame_end_v3,d,x) -#define __itt_frame_end_v3_ptr ITTNOTIFY_NAME(frame_end_v3) -#define __itt_frame_submit_v3(d,x,b,e) ITTNOTIFY_VOID_D3(frame_submit_v3,d,x,b,e) -#define __itt_frame_submit_v3_ptr ITTNOTIFY_NAME(frame_submit_v3) -#else /* INTEL_NO_ITTNOTIFY_API */ -#define __itt_frame_begin_v3(domain,id) -#define __itt_frame_begin_v3_ptr 0 -#define __itt_frame_end_v3(domain,id) -#define __itt_frame_end_v3_ptr 0 -#define __itt_frame_submit_v3(domain,id,begin,end) -#define __itt_frame_submit_v3_ptr 0 -#endif /* INTEL_NO_ITTNOTIFY_API */ -#else /* INTEL_NO_MACRO_BODY */ -#define __itt_frame_begin_v3_ptr 0 -#define __itt_frame_end_v3_ptr 0 -#define __itt_frame_submit_v3_ptr 0 -#endif /* INTEL_NO_MACRO_BODY */ -/** @endcond */ -/** @} frames group */ -/** @endcond */ - -/** - * @defgroup taskgroup Task Group - * @ingroup public - * Task Group - * @{ - */ -/** - * @ingroup task_groups - * @brief Denotes a task_group instance. - * Successive calls to __itt_task_group with the same ID are ignored. - * @param[in] domain The domain for this task_group instance - * @param[in] id The instance ID for this task_group instance. Must not be __itt_null. - * @param[in] parentid The instance ID for the parent of this task_group instance, or __itt_null. 
- * @param[in] name The name of this task_group
- */
-void ITTAPI __itt_task_group(const __itt_domain *domain, __itt_id id, __itt_id parentid, __itt_string_handle *name);
-
-/** @cond exclude_from_documentation */
-#ifndef INTEL_NO_MACRO_BODY
-#ifndef INTEL_NO_ITTNOTIFY_API
-ITT_STUBV(ITTAPI, void, task_group, (const __itt_domain *domain, __itt_id id, __itt_id parentid, __itt_string_handle *name))
-#define __itt_task_group(d,x,y,z) ITTNOTIFY_VOID_D3(task_group,d,x,y,z)
-#define __itt_task_group_ptr ITTNOTIFY_NAME(task_group)
-#else /* INTEL_NO_ITTNOTIFY_API */
-#define __itt_task_group(d,x,y,z)
-#define __itt_task_group_ptr 0
-#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
-#define __itt_task_group_ptr 0
-#endif /* INTEL_NO_MACRO_BODY */
-/** @endcond */
-/** @} taskgroup group */
-
-/**
- * @defgroup tasks Tasks
- * @ingroup public
- * A task instance represents a piece of work performed by a particular
- * thread for a period of time. A call to __itt_task_begin creates a
- * task instance. This becomes the current instance for that task on that
- * thread. A following call to __itt_task_end on the same thread ends the
- * instance. There may be multiple simultaneous instances of tasks with the
- * same name on different threads. If an ID is specified, the task instance
- * receives that ID. Nested tasks are allowed.
- *
- * Note: The task is defined by the bracketing of __itt_task_begin and
- * __itt_task_end on the same thread. If some scheduling mechanism causes
- * task switching (the thread executes a different user task) or thread
- * switching (the user task switches to a different thread) then this breaks
- * the notion of current instance. Additional API calls are required to
- * deal with that possibility.
- * @{
- */
-
-/**
- * @ingroup tasks
- * @brief Begin a task instance.
- * @param[in] domain The domain for this task
- * @param[in] taskid The instance ID for this task instance, or __itt_null
- * @param[in] parentid The parent instance to which this task instance belongs, or __itt_null
- * @param[in] name The name of this task
- */
-void ITTAPI __itt_task_begin(const __itt_domain *domain, __itt_id taskid, __itt_id parentid, __itt_string_handle *name);
-
-/**
- * @ingroup tasks
- * @brief Begin a task instance.
- * @param[in] domain The domain for this task
- * @param[in] taskid The identifier for this task instance (may be 0)
- * @param[in] parentid The parent of this task (may be 0)
- * @param[in] fn The pointer to the function you are tracing
- */
-void ITTAPI __itt_task_begin_fn(const __itt_domain *domain, __itt_id taskid, __itt_id parentid, void* fn);
-
-/**
- * @ingroup tasks
- * @brief End the current task instance.
- * @param[in] domain The domain for this task
- */
-void ITTAPI __itt_task_end(const __itt_domain *domain);
-
-/**
- * @ingroup tasks
- * @brief Begin an overlapped task instance.
- * @param[in] domain The domain for this task.
- * @param[in] taskid The identifier for this task instance, *cannot* be __itt_null.
- * @param[in] parentid The parent of this task, or __itt_null.
- * @param[in] name The name of this task.
- */
-void ITTAPI __itt_task_begin_overlapped(const __itt_domain* domain, __itt_id taskid, __itt_id parentid, __itt_string_handle* name);
-
-/**
- * @ingroup tasks
- * @brief End an overlapped task instance.
- * @param[in] domain The domain for this task
- * @param[in] taskid The explicit ID of the finished task
- */
-void ITTAPI __itt_task_end_overlapped(const __itt_domain *domain, __itt_id taskid);
-
-/** @cond exclude_from_documentation */
-#ifndef INTEL_NO_MACRO_BODY
-#ifndef INTEL_NO_ITTNOTIFY_API
-ITT_STUBV(ITTAPI, void, task_begin, (const __itt_domain *domain, __itt_id id, __itt_id parentid, __itt_string_handle *name))
-ITT_STUBV(ITTAPI, void, task_begin_fn, (const __itt_domain *domain, __itt_id id, __itt_id parentid, void* fn))
-ITT_STUBV(ITTAPI, void, task_end, (const __itt_domain *domain))
-ITT_STUBV(ITTAPI, void, task_begin_overlapped, (const __itt_domain *domain, __itt_id taskid, __itt_id parentid, __itt_string_handle *name))
-ITT_STUBV(ITTAPI, void, task_end_overlapped, (const __itt_domain *domain, __itt_id taskid))
-#define __itt_task_begin(d,x,y,z) ITTNOTIFY_VOID_D3(task_begin,d,x,y,z)
-#define __itt_task_begin_ptr ITTNOTIFY_NAME(task_begin)
-#define __itt_task_begin_fn(d,x,y,z) ITTNOTIFY_VOID_D3(task_begin_fn,d,x,y,z)
-#define __itt_task_begin_fn_ptr ITTNOTIFY_NAME(task_begin_fn)
-#define __itt_task_end(d) ITTNOTIFY_VOID_D0(task_end,d)
-#define __itt_task_end_ptr ITTNOTIFY_NAME(task_end)
-#define __itt_task_begin_overlapped(d,x,y,z) ITTNOTIFY_VOID_D3(task_begin_overlapped,d,x,y,z)
-#define __itt_task_begin_overlapped_ptr ITTNOTIFY_NAME(task_begin_overlapped)
-#define __itt_task_end_overlapped(d,x) ITTNOTIFY_VOID_D1(task_end_overlapped,d,x)
-#define __itt_task_end_overlapped_ptr ITTNOTIFY_NAME(task_end_overlapped)
-#else /* INTEL_NO_ITTNOTIFY_API */
-#define __itt_task_begin(domain,id,parentid,name)
-#define __itt_task_begin_ptr 0
-#define __itt_task_begin_fn(domain,id,parentid,fn)
-#define __itt_task_begin_fn_ptr 0
-#define __itt_task_end(domain)
-#define __itt_task_end_ptr 0
-#define __itt_task_begin_overlapped(domain,taskid,parentid,name)
-#define __itt_task_begin_overlapped_ptr 0
-#define __itt_task_end_overlapped(domain,taskid)
-#define __itt_task_end_overlapped_ptr 0
-#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
-#define __itt_task_begin_ptr 0
-#define __itt_task_begin_fn_ptr 0
-#define __itt_task_end_ptr 0
-#define __itt_task_begin_overlapped_ptr 0
-#define __itt_task_end_overlapped_ptr 0
-#endif /* INTEL_NO_MACRO_BODY */
-/** @endcond */
-/** @} tasks group */
-
-
-/**
- * @defgroup markers Markers
- * Markers represent a single discrete event in time. Markers have a scope,
- * described by an enumerated type __itt_scope. Markers are created by
- * the API call __itt_marker. A marker instance can be given an ID for use in
- * adding metadata.
- * @{
- */
-
-/**
- * @brief Describes the scope of an event object in the trace.
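
A minimal sketch of the per-thread bracketing described above (the domain and handle names are illustrative; later sketches in this section reuse these globals):

    static __itt_domain* g_domain;
    static __itt_string_handle* g_sh_compute;

    void tracing_init(void)
    {
        g_domain = __itt_domain_create("com.example.engine");
        g_sh_compute = __itt_string_handle_create("compute");
    }

    void compute(void)
    {
        __itt_task_begin(g_domain, __itt_null, __itt_null, g_sh_compute);
        /* ... work performed on this thread; nested task_begin calls are allowed ... */
        __itt_task_end(g_domain);
    }
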
- */
-typedef enum
-{
- __itt_scope_unknown = 0,
- __itt_scope_global,
- __itt_scope_track_group,
- __itt_scope_track,
- __itt_scope_task,
- __itt_scope_marker
-} __itt_scope;
-
-/** @cond exclude_from_documentation */
-#define __itt_marker_scope_unknown __itt_scope_unknown
-#define __itt_marker_scope_global __itt_scope_global
-#define __itt_marker_scope_process __itt_scope_track_group
-#define __itt_marker_scope_thread __itt_scope_track
-#define __itt_marker_scope_task __itt_scope_task
-/** @endcond */
-
-/**
- * @ingroup markers
- * @brief Create a marker instance
- * @param[in] domain The domain for this marker
- * @param[in] id The instance ID for this marker or __itt_null
- * @param[in] name The name for this marker
- * @param[in] scope The scope for this marker
- */
-void ITTAPI __itt_marker(const __itt_domain *domain, __itt_id id, __itt_string_handle *name, __itt_scope scope);
-
-/** @cond exclude_from_documentation */
-#ifndef INTEL_NO_MACRO_BODY
-#ifndef INTEL_NO_ITTNOTIFY_API
-ITT_STUBV(ITTAPI, void, marker, (const __itt_domain *domain, __itt_id id, __itt_string_handle *name, __itt_scope scope))
-#define __itt_marker(d,x,y,z) ITTNOTIFY_VOID_D3(marker,d,x,y,z)
-#define __itt_marker_ptr ITTNOTIFY_NAME(marker)
-#else /* INTEL_NO_ITTNOTIFY_API */
-#define __itt_marker(domain,id,name,scope)
-#define __itt_marker_ptr 0
-#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
-#define __itt_marker_ptr 0
-#endif /* INTEL_NO_MACRO_BODY */
-/** @endcond */
-/** @} markers group */
-
-/**
- * @defgroup metadata Metadata
- * The metadata API is used to attach extra information to named
- * entities. Metadata can be attached to an identified named entity by ID,
- * or to the current entity (which is always a task).
- *
- * Conceptually metadata has a type (what kind of metadata), a key (the
- * name of the metadata), and a value (the actual data). The encoding of
- * the value depends on the type of the metadata.
- *
- * The type of metadata is specified by an enumerated type __itt_metadata_type.
- * @{
- */
-
-/**
- * @ingroup parameters
- * @brief Describes the type of metadata
- */
-typedef enum {
- __itt_metadata_unknown = 0,
- __itt_metadata_u64, /**< Unsigned 64-bit integer */
- __itt_metadata_s64, /**< Signed 64-bit integer */
- __itt_metadata_u32, /**< Unsigned 32-bit integer */
- __itt_metadata_s32, /**< Signed 32-bit integer */
- __itt_metadata_u16, /**< Unsigned 16-bit integer */
- __itt_metadata_s16, /**< Signed 16-bit integer */
- __itt_metadata_float, /**< Signed 32-bit floating-point */
- __itt_metadata_double /**< Signed 64-bit floating-point */
-} __itt_metadata_type;
-
-/**
- * @ingroup parameters
- * @brief Add metadata to an instance of a named entity.
- * @param[in] domain The domain controlling the call
- * @param[in] id The identifier of the instance to which the metadata is to be added, or __itt_null to add to the current task
- * @param[in] key The name of the metadata
- * @param[in] type The type of the metadata
- * @param[in] count The number of elements of the given type. If count == 0, no metadata will be added.
- * @param[in] data The metadata itself -*/ -void ITTAPI __itt_metadata_add(const __itt_domain *domain, __itt_id id, __itt_string_handle *key, __itt_metadata_type type, size_t count, void *data); - -/** @cond exclude_from_documentation */ -#ifndef INTEL_NO_MACRO_BODY -#ifndef INTEL_NO_ITTNOTIFY_API -ITT_STUBV(ITTAPI, void, metadata_add, (const __itt_domain *domain, __itt_id id, __itt_string_handle *key, __itt_metadata_type type, size_t count, void *data)) -#define __itt_metadata_add(d,x,y,z,a,b) ITTNOTIFY_VOID_D5(metadata_add,d,x,y,z,a,b) -#define __itt_metadata_add_ptr ITTNOTIFY_NAME(metadata_add) -#else /* INTEL_NO_ITTNOTIFY_API */ -#define __itt_metadata_add(d,x,y,z,a,b) -#define __itt_metadata_add_ptr 0 -#endif /* INTEL_NO_ITTNOTIFY_API */ -#else /* INTEL_NO_MACRO_BODY */ -#define __itt_metadata_add_ptr 0 -#endif /* INTEL_NO_MACRO_BODY */ -/** @endcond */ - -/** - * @ingroup parameters - * @brief Add string metadata to an instance of a named entity. - * @param[in] domain The domain controlling the call - * @param[in] id The identifier of the instance to which the metadata is to be added, or __itt_null to add to the current task - * @param[in] key The name of the metadata - * @param[in] data The metadata itself - * @param[in] length The number of characters in the string, or -1 if the length is unknown but the string is null-terminated -*/ -#if ITT_PLATFORM==ITT_PLATFORM_WIN -void ITTAPI __itt_metadata_str_addA(const __itt_domain *domain, __itt_id id, __itt_string_handle *key, const char *data, size_t length); -void ITTAPI __itt_metadata_str_addW(const __itt_domain *domain, __itt_id id, __itt_string_handle *key, const wchar_t *data, size_t length); -#if defined(UNICODE) || defined(_UNICODE) -# define __itt_metadata_str_add __itt_metadata_str_addW -# define __itt_metadata_str_add_ptr __itt_metadata_str_addW_ptr -#else /* UNICODE */ -# define __itt_metadata_str_add __itt_metadata_str_addA -# define __itt_metadata_str_add_ptr __itt_metadata_str_addA_ptr -#endif /* UNICODE */ -#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -void ITTAPI __itt_metadata_str_add(const __itt_domain *domain, __itt_id id, __itt_string_handle *key, const char *data, size_t length); -#endif - -/** @cond exclude_from_documentation */ -#ifndef INTEL_NO_MACRO_BODY -#ifndef INTEL_NO_ITTNOTIFY_API -#if ITT_PLATFORM==ITT_PLATFORM_WIN -ITT_STUBV(ITTAPI, void, metadata_str_addA, (const __itt_domain *domain, __itt_id id, __itt_string_handle *key, const char *data, size_t length)) -ITT_STUBV(ITTAPI, void, metadata_str_addW, (const __itt_domain *domain, __itt_id id, __itt_string_handle *key, const wchar_t *data, size_t length)) -#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -ITT_STUBV(ITTAPI, void, metadata_str_add, (const __itt_domain *domain, __itt_id id, __itt_string_handle *key, const char *data, size_t length)) -#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#if ITT_PLATFORM==ITT_PLATFORM_WIN -#define __itt_metadata_str_addA(d,x,y,z,a) ITTNOTIFY_VOID_D4(metadata_str_addA,d,x,y,z,a) -#define __itt_metadata_str_addA_ptr ITTNOTIFY_NAME(metadata_str_addA) -#define __itt_metadata_str_addW(d,x,y,z,a) ITTNOTIFY_VOID_D4(metadata_str_addW,d,x,y,z,a) -#define __itt_metadata_str_addW_ptr ITTNOTIFY_NAME(metadata_str_addW) -#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#define __itt_metadata_str_add(d,x,y,z,a) ITTNOTIFY_VOID_D4(metadata_str_add,d,x,y,z,a) -#define __itt_metadata_str_add_ptr ITTNOTIFY_NAME(metadata_str_add) -#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#else /* INTEL_NO_ITTNOTIFY_API */ -#if ITT_PLATFORM==ITT_PLATFORM_WIN 
-#define __itt_metadata_str_addA(d,x,y,z,a)
-#define __itt_metadata_str_addA_ptr 0
-#define __itt_metadata_str_addW(d,x,y,z,a)
-#define __itt_metadata_str_addW_ptr 0
-#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#define __itt_metadata_str_add(d,x,y,z,a)
-#define __itt_metadata_str_add_ptr 0
-#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-#define __itt_metadata_str_addA_ptr 0
-#define __itt_metadata_str_addW_ptr 0
-#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#define __itt_metadata_str_add_ptr 0
-#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#endif /* INTEL_NO_MACRO_BODY */
-/** @endcond */
-
-/**
- * @ingroup parameters
- * @brief Add metadata to an instance of a named entity.
- * @param[in] domain The domain controlling the call
- * @param[in] scope The scope of the instance to which the metadata is to be added
- * @param[in] key The name of the metadata
- * @param[in] type The type of the metadata
- * @param[in] count The number of elements of the given type. If count == 0, no metadata will be added.
- * @param[in] data The metadata itself
-*/
-void ITTAPI __itt_metadata_add_with_scope(const __itt_domain *domain, __itt_scope scope, __itt_string_handle *key, __itt_metadata_type type, size_t count, void *data);
-
-/** @cond exclude_from_documentation */
-#ifndef INTEL_NO_MACRO_BODY
-#ifndef INTEL_NO_ITTNOTIFY_API
-ITT_STUBV(ITTAPI, void, metadata_add_with_scope, (const __itt_domain *domain, __itt_scope scope, __itt_string_handle *key, __itt_metadata_type type, size_t count, void *data))
-#define __itt_metadata_add_with_scope(d,x,y,z,a,b) ITTNOTIFY_VOID_D5(metadata_add_with_scope,d,x,y,z,a,b)
-#define __itt_metadata_add_with_scope_ptr ITTNOTIFY_NAME(metadata_add_with_scope)
-#else /* INTEL_NO_ITTNOTIFY_API */
-#define __itt_metadata_add_with_scope(d,x,y,z,a,b)
-#define __itt_metadata_add_with_scope_ptr 0
-#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
-#define __itt_metadata_add_with_scope_ptr 0
-#endif /* INTEL_NO_MACRO_BODY */
-/** @endcond */
-
-/**
- * @ingroup parameters
- * @brief Add string metadata to an instance of a named entity.
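
By way of example, a marker followed by numeric and string metadata attached to the current task, reusing the globals from the task sketch above; the keys and values are illustrative:

    void compute_with_metadata(void)
    {
        unsigned long long bytes = 4096;

        __itt_marker(g_domain, __itt_null,
                     __itt_string_handle_create("checkpoint"), __itt_scope_global);

        __itt_task_begin(g_domain, __itt_null, __itt_null, g_sh_compute);
        /* __itt_null as the id argument attaches metadata to the current task. */
        __itt_metadata_add(g_domain, __itt_null,
                           __itt_string_handle_create("buffer_bytes"),
                           __itt_metadata_u64, 1, &bytes);
        __itt_metadata_str_add(g_domain, __itt_null,
                               __itt_string_handle_create("phase"),
                               "warm-up", -1 /* null-terminated */);
        __itt_task_end(g_domain);
    }
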
- * @param[in] domain The domain controlling the call
- * @param[in] scope The scope of the instance to which the metadata is to be added
- * @param[in] key The name of the metadata
- * @param[in] data The metadata itself
- * @param[in] length The number of characters in the string, or -1 if the length is unknown but the string is null-terminated
-*/
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-void ITTAPI __itt_metadata_str_add_with_scopeA(const __itt_domain *domain, __itt_scope scope, __itt_string_handle *key, const char *data, size_t length);
-void ITTAPI __itt_metadata_str_add_with_scopeW(const __itt_domain *domain, __itt_scope scope, __itt_string_handle *key, const wchar_t *data, size_t length);
-#if defined(UNICODE) || defined(_UNICODE)
-# define __itt_metadata_str_add_with_scope __itt_metadata_str_add_with_scopeW
-# define __itt_metadata_str_add_with_scope_ptr __itt_metadata_str_add_with_scopeW_ptr
-#else /* UNICODE */
-# define __itt_metadata_str_add_with_scope __itt_metadata_str_add_with_scopeA
-# define __itt_metadata_str_add_with_scope_ptr __itt_metadata_str_add_with_scopeA_ptr
-#endif /* UNICODE */
-#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-void ITTAPI __itt_metadata_str_add_with_scope(const __itt_domain *domain, __itt_scope scope, __itt_string_handle *key, const char *data, size_t length);
-#endif
-
-/** @cond exclude_from_documentation */
-#ifndef INTEL_NO_MACRO_BODY
-#ifndef INTEL_NO_ITTNOTIFY_API
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-ITT_STUBV(ITTAPI, void, metadata_str_add_with_scopeA, (const __itt_domain *domain, __itt_scope scope, __itt_string_handle *key, const char *data, size_t length))
-ITT_STUBV(ITTAPI, void, metadata_str_add_with_scopeW, (const __itt_domain *domain, __itt_scope scope, __itt_string_handle *key, const wchar_t *data, size_t length))
-#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-ITT_STUBV(ITTAPI, void, metadata_str_add_with_scope, (const __itt_domain *domain, __itt_scope scope, __itt_string_handle *key, const char *data, size_t length))
-#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-#define __itt_metadata_str_add_with_scopeA(d,x,y,z,a) ITTNOTIFY_VOID_D4(metadata_str_add_with_scopeA,d,x,y,z,a)
-#define __itt_metadata_str_add_with_scopeA_ptr ITTNOTIFY_NAME(metadata_str_add_with_scopeA)
-#define __itt_metadata_str_add_with_scopeW(d,x,y,z,a) ITTNOTIFY_VOID_D4(metadata_str_add_with_scopeW,d,x,y,z,a)
-#define __itt_metadata_str_add_with_scopeW_ptr ITTNOTIFY_NAME(metadata_str_add_with_scopeW)
-#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#define __itt_metadata_str_add_with_scope(d,x,y,z,a) ITTNOTIFY_VOID_D4(metadata_str_add_with_scope,d,x,y,z,a)
-#define __itt_metadata_str_add_with_scope_ptr ITTNOTIFY_NAME(metadata_str_add_with_scope)
-#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#else /* INTEL_NO_ITTNOTIFY_API */
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-#define __itt_metadata_str_add_with_scopeA(d,x,y,z,a)
-#define __itt_metadata_str_add_with_scopeA_ptr 0
-#define __itt_metadata_str_add_with_scopeW(d,x,y,z,a)
-#define __itt_metadata_str_add_with_scopeW_ptr 0
-#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#define __itt_metadata_str_add_with_scope(d,x,y,z,a)
-#define __itt_metadata_str_add_with_scope_ptr 0
-#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-#define __itt_metadata_str_add_with_scopeA_ptr 0
-#define __itt_metadata_str_add_with_scopeW_ptr 0 -#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#define __itt_metadata_str_add_with_scope_ptr 0 -#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#endif /* INTEL_NO_MACRO_BODY */ -/** @endcond */ - -/** @} metadata group */ - -/** - * @defgroup relations Relations - * Instances of named entities can be explicitly associated with other - * instances using instance IDs and the relationship API calls. - * - * @{ - */ - -/** - * @ingroup relations - * @brief The kind of relation between two instances is specified by the enumerated type __itt_relation. - * Relations between instances can be added with an API call. The relation - * API uses instance IDs. Relations can be added before or after the actual - * instances are created and persist independently of the instances. This - * is the motivation for having different lifetimes for instance IDs and - * the actual instances. - */ -typedef enum -{ - __itt_relation_is_unknown = 0, - __itt_relation_is_dependent_on, /**< "A is dependent on B" means that A cannot start until B completes */ - __itt_relation_is_sibling_of, /**< "A is sibling of B" means that A and B were created as a group */ - __itt_relation_is_parent_of, /**< "A is parent of B" means that A created B */ - __itt_relation_is_continuation_of, /**< "A is continuation of B" means that A assumes the dependencies of B */ - __itt_relation_is_child_of, /**< "A is child of B" means that A was created by B (inverse of is_parent_of) */ - __itt_relation_is_continued_by, /**< "A is continued by B" means that B assumes the dependencies of A (inverse of is_continuation_of) */ - __itt_relation_is_predecessor_to /**< "A is predecessor to B" means that B cannot start until A completes (inverse of is_dependent_on) */ -} __itt_relation; - -/** - * @ingroup relations - * @brief Add a relation to the current task instance. - * The current task instance is the head of the relation. - * @param[in] domain The domain controlling this call - * @param[in] relation The kind of relation - * @param[in] tail The ID for the tail of the relation - */ -void ITTAPI __itt_relation_add_to_current(const __itt_domain *domain, __itt_relation relation, __itt_id tail); - -/** - * @ingroup relations - * @brief Add a relation between two instance identifiers. 
- * @param[in] domain The domain controlling this call - * @param[in] head The ID for the head of the relation - * @param[in] relation The kind of relation - * @param[in] tail The ID for the tail of the relation - */ -void ITTAPI __itt_relation_add(const __itt_domain *domain, __itt_id head, __itt_relation relation, __itt_id tail); - -/** @cond exclude_from_documentation */ -#ifndef INTEL_NO_MACRO_BODY -#ifndef INTEL_NO_ITTNOTIFY_API -ITT_STUBV(ITTAPI, void, relation_add_to_current, (const __itt_domain *domain, __itt_relation relation, __itt_id tail)) -ITT_STUBV(ITTAPI, void, relation_add, (const __itt_domain *domain, __itt_id head, __itt_relation relation, __itt_id tail)) -#define __itt_relation_add_to_current(d,x,y) ITTNOTIFY_VOID_D2(relation_add_to_current,d,x,y) -#define __itt_relation_add_to_current_ptr ITTNOTIFY_NAME(relation_add_to_current) -#define __itt_relation_add(d,x,y,z) ITTNOTIFY_VOID_D3(relation_add,d,x,y,z) -#define __itt_relation_add_ptr ITTNOTIFY_NAME(relation_add) -#else /* INTEL_NO_ITTNOTIFY_API */ -#define __itt_relation_add_to_current(d,x,y) -#define __itt_relation_add_to_current_ptr 0 -#define __itt_relation_add(d,x,y,z) -#define __itt_relation_add_ptr 0 -#endif /* INTEL_NO_ITTNOTIFY_API */ -#else /* INTEL_NO_MACRO_BODY */ -#define __itt_relation_add_to_current_ptr 0 -#define __itt_relation_add_ptr 0 -#endif /* INTEL_NO_MACRO_BODY */ -/** @endcond */ -/** @} relations group */ - -/** @cond exclude_from_documentation */ -#pragma pack(push, 8) - -typedef struct ___itt_clock_info -{ - unsigned long long clock_freq; /*!< Clock domain frequency */ - unsigned long long clock_base; /*!< Clock domain base timestamp */ -} __itt_clock_info; - -#pragma pack(pop) -/** @endcond */ - -/** @cond exclude_from_documentation */ -typedef void (ITTAPI *__itt_get_clock_info_fn)(__itt_clock_info* clock_info, void* data); -/** @endcond */ - -/** @cond exclude_from_documentation */ -#pragma pack(push, 8) - -typedef struct ___itt_clock_domain -{ - __itt_clock_info info; /*!< Most recent clock domain info */ - __itt_get_clock_info_fn fn; /*!< Callback function pointer */ - void* fn_data; /*!< Input argument for the callback function */ - int extra1; /*!< Reserved. Must be zero */ - void* extra2; /*!< Reserved. Must be zero */ - struct ___itt_clock_domain* next; -} __itt_clock_domain; - -#pragma pack(pop) -/** @endcond */ - -/** - * @ingroup clockdomains - * @brief Create a clock domain. - * Certain applications require the capability to trace their application using - * a clock domain different than the CPU, for instance the instrumentation of events - * that occur on a GPU. - * Because the set of domains is expected to be static over the application's execution time, - * there is no mechanism to destroy a domain. - * Any domain can be accessed by any thread in the process, regardless of which thread created - * the domain. This call is thread-safe. 
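
For instance, expressing "consumer cannot start until producer completes" with the enum above; the two objects are illustrative, and g_domain is reused from the earlier task sketch:

    void link_tasks(void* producer_obj, void* consumer_obj)
    {
        __itt_id producer = __itt_id_make(producer_obj, 0);
        __itt_id consumer = __itt_id_make(consumer_obj, 0);
        __itt_id_create(g_domain, producer);
        __itt_id_create(g_domain, consumer);
        /* head = consumer, tail = producer: "consumer is dependent on producer" */
        __itt_relation_add(g_domain, consumer, __itt_relation_is_dependent_on, producer);
    }
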
- * @param[in] fn A pointer to a callback function which retrieves alternative CPU timestamps
- * @param[in] fn_data Argument for a callback function; may be NULL
- */
-__itt_clock_domain* ITTAPI __itt_clock_domain_create(__itt_get_clock_info_fn fn, void* fn_data);
-
-/** @cond exclude_from_documentation */
-#ifndef INTEL_NO_MACRO_BODY
-#ifndef INTEL_NO_ITTNOTIFY_API
-ITT_STUB(ITTAPI, __itt_clock_domain*, clock_domain_create, (__itt_get_clock_info_fn fn, void* fn_data))
-#define __itt_clock_domain_create ITTNOTIFY_DATA(clock_domain_create)
-#define __itt_clock_domain_create_ptr ITTNOTIFY_NAME(clock_domain_create)
-#else /* INTEL_NO_ITTNOTIFY_API */
-#define __itt_clock_domain_create(fn,fn_data) (__itt_clock_domain*)0
-#define __itt_clock_domain_create_ptr 0
-#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
-#define __itt_clock_domain_create_ptr 0
-#endif /* INTEL_NO_MACRO_BODY */
-/** @endcond */
-
-/**
- * @ingroup clockdomains
- * @brief Recalculate clock domain frequencies and clock base timestamps.
- */
-void ITTAPI __itt_clock_domain_reset(void);
-
-/** @cond exclude_from_documentation */
-#ifndef INTEL_NO_MACRO_BODY
-#ifndef INTEL_NO_ITTNOTIFY_API
-ITT_STUBV(ITTAPI, void, clock_domain_reset, (void))
-#define __itt_clock_domain_reset ITTNOTIFY_VOID(clock_domain_reset)
-#define __itt_clock_domain_reset_ptr ITTNOTIFY_NAME(clock_domain_reset)
-#else /* INTEL_NO_ITTNOTIFY_API */
-#define __itt_clock_domain_reset()
-#define __itt_clock_domain_reset_ptr 0
-#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
-#define __itt_clock_domain_reset_ptr 0
-#endif /* INTEL_NO_MACRO_BODY */
-/** @endcond */
-
-/**
- * @ingroup clockdomain
- * @brief Create an instance of identifier. This establishes the beginning of the lifetime of
- * an instance of the given ID in the trace. Once this lifetime starts, the ID can be used to
- * tag named entity instances in calls such as __itt_task_begin, and to specify relationships among
- * identified named entity instances, using the \ref relations APIs.
- * @param[in] domain The domain controlling the execution of this call.
- * @param[in] clock_domain The clock domain controlling the execution of this call.
- * @param[in] timestamp The user defined timestamp.
- * @param[in] id The ID to create.
- */
-void ITTAPI __itt_id_create_ex(const __itt_domain* domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id id);
-
-/**
- * @ingroup clockdomain
- * @brief Destroy an instance of identifier. This ends the lifetime of the current instance of the
- * given ID value in the trace. Any relationships that are established after this lifetime ends are
- * invalid. This call must be performed before the given ID value can be reused for a different
- * named entity instance.
- * @param[in] domain The domain controlling the execution of this call.
- * @param[in] clock_domain The clock domain controlling the execution of this call.
- * @param[in] timestamp The user defined timestamp.
- * @param[in] id The ID to destroy.
- */ -void ITTAPI __itt_id_destroy_ex(const __itt_domain* domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id id); - -/** @cond exclude_from_documentation */ -#ifndef INTEL_NO_MACRO_BODY -#ifndef INTEL_NO_ITTNOTIFY_API -ITT_STUBV(ITTAPI, void, id_create_ex, (const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id id)) -ITT_STUBV(ITTAPI, void, id_destroy_ex, (const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id id)) -#define __itt_id_create_ex(d,x,y,z) ITTNOTIFY_VOID_D3(id_create_ex,d,x,y,z) -#define __itt_id_create_ex_ptr ITTNOTIFY_NAME(id_create_ex) -#define __itt_id_destroy_ex(d,x,y,z) ITTNOTIFY_VOID_D3(id_destroy_ex,d,x,y,z) -#define __itt_id_destroy_ex_ptr ITTNOTIFY_NAME(id_destroy_ex) -#else /* INTEL_NO_ITTNOTIFY_API */ -#define __itt_id_create_ex(domain,clock_domain,timestamp,id) -#define __itt_id_create_ex_ptr 0 -#define __itt_id_destroy_ex(domain,clock_domain,timestamp,id) -#define __itt_id_destroy_ex_ptr 0 -#endif /* INTEL_NO_ITTNOTIFY_API */ -#else /* INTEL_NO_MACRO_BODY */ -#define __itt_id_create_ex_ptr 0 -#define __itt_id_destroy_ex_ptr 0 -#endif /* INTEL_NO_MACRO_BODY */ -/** @endcond */ - -/** - * @ingroup clockdomain - * @brief Begin a task instance. - * @param[in] domain The domain for this task - * @param[in] clock_domain The clock domain controlling the execution of this call. - * @param[in] timestamp The user defined timestamp. - * @param[in] taskid The instance ID for this task instance, or __itt_null - * @param[in] parentid The parent instance to which this task instance belongs, or __itt_null - * @param[in] name The name of this task - */ -void ITTAPI __itt_task_begin_ex(const __itt_domain* domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id taskid, __itt_id parentid, __itt_string_handle* name); - -/** - * @ingroup clockdomain - * @brief Begin a task instance. - * @param[in] domain The domain for this task - * @param[in] clock_domain The clock domain controlling the execution of this call. - * @param[in] timestamp The user defined timestamp. - * @param[in] taskid The identifier for this task instance, or __itt_null - * @param[in] parentid The parent of this task, or __itt_null - * @param[in] fn The pointer to the function you are tracing - */ -void ITTAPI __itt_task_begin_fn_ex(const __itt_domain* domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id taskid, __itt_id parentid, void* fn); - -/** - * @ingroup clockdomain - * @brief End the current task instance. - * @param[in] domain The domain for this task - * @param[in] clock_domain The clock domain controlling the execution of this call. - * @param[in] timestamp The user defined timestamp. 
- */ -void ITTAPI __itt_task_end_ex(const __itt_domain* domain, __itt_clock_domain* clock_domain, unsigned long long timestamp); - -/** @cond exclude_from_documentation */ -#ifndef INTEL_NO_MACRO_BODY -#ifndef INTEL_NO_ITTNOTIFY_API -ITT_STUBV(ITTAPI, void, task_begin_ex, (const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id id, __itt_id parentid, __itt_string_handle *name)) -ITT_STUBV(ITTAPI, void, task_begin_fn_ex, (const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id id, __itt_id parentid, void* fn)) -ITT_STUBV(ITTAPI, void, task_end_ex, (const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp)) -#define __itt_task_begin_ex(d,x,y,z,a,b) ITTNOTIFY_VOID_D5(task_begin_ex,d,x,y,z,a,b) -#define __itt_task_begin_ex_ptr ITTNOTIFY_NAME(task_begin_ex) -#define __itt_task_begin_fn_ex(d,x,y,z,a,b) ITTNOTIFY_VOID_D5(task_begin_fn_ex,d,x,y,z,a,b) -#define __itt_task_begin_fn_ex_ptr ITTNOTIFY_NAME(task_begin_fn_ex) -#define __itt_task_end_ex(d,x,y) ITTNOTIFY_VOID_D2(task_end_ex,d,x,y) -#define __itt_task_end_ex_ptr ITTNOTIFY_NAME(task_end_ex) -#else /* INTEL_NO_ITTNOTIFY_API */ -#define __itt_task_begin_ex(domain,clock_domain,timestamp,id,parentid,name) -#define __itt_task_begin_ex_ptr 0 -#define __itt_task_begin_fn_ex(domain,clock_domain,timestamp,id,parentid,fn) -#define __itt_task_begin_fn_ex_ptr 0 -#define __itt_task_end_ex(domain,clock_domain,timestamp) -#define __itt_task_end_ex_ptr 0 -#endif /* INTEL_NO_ITTNOTIFY_API */ -#else /* INTEL_NO_MACRO_BODY */ -#define __itt_task_begin_ex_ptr 0 -#define __itt_task_begin_fn_ex_ptr 0 -#define __itt_task_end_ex_ptr 0 -#endif /* INTEL_NO_MACRO_BODY */ -/** @endcond */ - -/** - * @defgroup counters Counters - * @ingroup public - * Counters are user-defined objects with a monotonically increasing - * value. Counter values are 64-bit unsigned integers. - * Counters have names that can be displayed in - * the tools. 
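
A sketch of replaying device-side work through a custom clock domain; the frequency value, the t0/t1 device timestamps, and all names are assumptions for illustration, and g_domain comes from the earlier task sketch:

    /* Callback the collector invokes to (re)query the device clock. */
    static void ITTAPI get_device_clock(__itt_clock_info* info, void* data)
    {
        (void)data;
        info->clock_freq = 1000000000ULL; /* ticks per second; device-specific */
        info->clock_base = 0;             /* device timestamp at the time origin */
    }

    void replay_kernel(unsigned long long t0, unsigned long long t1)
    {
        static __itt_clock_domain* cd = NULL;
        if (cd == NULL) /* clock domains cannot be destroyed; create once */
            cd = __itt_clock_domain_create(get_device_clock, NULL);
        __itt_task_begin_ex(g_domain, cd, t0, __itt_null, __itt_null,
                            __itt_string_handle_create("kernel"));
        __itt_task_end_ex(g_domain, cd, t1);
    }
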
- * @{
- */
-
-/**
- * @brief Opaque structure for counter identification
- */
-/** @cond exclude_from_documentation */
-
-typedef struct ___itt_counter* __itt_counter;
-
-/**
- * @brief Create an unsigned 64-bit integer counter with the given name/domain
- *
- * After __itt_counter_create() is called, __itt_counter_inc(id), __itt_counter_inc_delta(id, delta),
- * __itt_counter_set_value(id, value_ptr) or __itt_counter_set_value_ex(id, clock_domain, timestamp, value_ptr)
- * can be used to change the value of the counter, where value_ptr is a pointer to an unsigned 64-bit integer
- *
- * The call is equivalent to __itt_counter_create_typed(name, domain, __itt_metadata_u64)
- */
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-__itt_counter ITTAPI __itt_counter_createA(const char *name, const char *domain);
-__itt_counter ITTAPI __itt_counter_createW(const wchar_t *name, const wchar_t *domain);
-#if defined(UNICODE) || defined(_UNICODE)
-# define __itt_counter_create __itt_counter_createW
-# define __itt_counter_create_ptr __itt_counter_createW_ptr
-#else /* UNICODE */
-# define __itt_counter_create __itt_counter_createA
-# define __itt_counter_create_ptr __itt_counter_createA_ptr
-#endif /* UNICODE */
-#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-__itt_counter ITTAPI __itt_counter_create(const char *name, const char *domain);
-#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-
-#ifndef INTEL_NO_MACRO_BODY
-#ifndef INTEL_NO_ITTNOTIFY_API
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-ITT_STUB(ITTAPI, __itt_counter, counter_createA, (const char *name, const char *domain))
-ITT_STUB(ITTAPI, __itt_counter, counter_createW, (const wchar_t *name, const wchar_t *domain))
-#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-ITT_STUB(ITTAPI, __itt_counter, counter_create, (const char *name, const char *domain))
-#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-#define __itt_counter_createA ITTNOTIFY_DATA(counter_createA)
-#define __itt_counter_createA_ptr ITTNOTIFY_NAME(counter_createA)
-#define __itt_counter_createW ITTNOTIFY_DATA(counter_createW)
-#define __itt_counter_createW_ptr ITTNOTIFY_NAME(counter_createW)
-#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#define __itt_counter_create ITTNOTIFY_DATA(counter_create)
-#define __itt_counter_create_ptr ITTNOTIFY_NAME(counter_create)
-#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#else /* INTEL_NO_ITTNOTIFY_API */
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-#define __itt_counter_createA(name, domain) (__itt_counter)0
-#define __itt_counter_createA_ptr 0
-#define __itt_counter_createW(name, domain) (__itt_counter)0
-#define __itt_counter_createW_ptr 0
-#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#define __itt_counter_create(name, domain) (__itt_counter)0
-#define __itt_counter_create_ptr 0
-#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-#define __itt_counter_createA_ptr 0
-#define __itt_counter_createW_ptr 0
-#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#define __itt_counter_create_ptr 0
-#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#endif /* INTEL_NO_MACRO_BODY */
-/** @endcond */
-
-/**
- * @brief Increment the unsigned 64-bit integer counter value
- *
- * Calling this function on a counter that is not an unsigned 64-bit integer has no effect
- */
-void ITTAPI __itt_counter_inc(__itt_counter id);
-
-#ifndef INTEL_NO_MACRO_BODY
-#ifndef INTEL_NO_ITTNOTIFY_API
-ITT_STUBV(ITTAPI, void, counter_inc, (__itt_counter id))
-#define __itt_counter_inc ITTNOTIFY_VOID(counter_inc)
-#define __itt_counter_inc_ptr ITTNOTIFY_NAME(counter_inc)
-#else /* INTEL_NO_ITTNOTIFY_API */ -#define __itt_counter_inc(id) -#define __itt_counter_inc_ptr 0 -#endif /* INTEL_NO_ITTNOTIFY_API */ -#else /* INTEL_NO_MACRO_BODY */ -#define __itt_counter_inc_ptr 0 -#endif /* INTEL_NO_MACRO_BODY */ -/** @endcond */ -/** - * @brief Increment the unsigned 64 bits integer counter value with x - * - * Calling this function to non-unsigned 64 bits integer counters has no effect - */ -void ITTAPI __itt_counter_inc_delta(__itt_counter id, unsigned long long value); - -#ifndef INTEL_NO_MACRO_BODY -#ifndef INTEL_NO_ITTNOTIFY_API -ITT_STUBV(ITTAPI, void, counter_inc_delta, (__itt_counter id, unsigned long long value)) -#define __itt_counter_inc_delta ITTNOTIFY_VOID(counter_inc_delta) -#define __itt_counter_inc_delta_ptr ITTNOTIFY_NAME(counter_inc_delta) -#else /* INTEL_NO_ITTNOTIFY_API */ -#define __itt_counter_inc_delta(id, value) -#define __itt_counter_inc_delta_ptr 0 -#endif /* INTEL_NO_ITTNOTIFY_API */ -#else /* INTEL_NO_MACRO_BODY */ -#define __itt_counter_inc_delta_ptr 0 -#endif /* INTEL_NO_MACRO_BODY */ -/** @endcond */ - -/** - * @brief Decrement the unsigned 64 bits integer counter value - * - * Calling this function to non-unsigned 64 bits integer counters has no effect - */ -void ITTAPI __itt_counter_dec(__itt_counter id); - -#ifndef INTEL_NO_MACRO_BODY -#ifndef INTEL_NO_ITTNOTIFY_API -ITT_STUBV(ITTAPI, void, counter_dec, (__itt_counter id)) -#define __itt_counter_dec ITTNOTIFY_VOID(counter_dec) -#define __itt_counter_dec_ptr ITTNOTIFY_NAME(counter_dec) -#else /* INTEL_NO_ITTNOTIFY_API */ -#define __itt_counter_dec(id) -#define __itt_counter_dec_ptr 0 -#endif /* INTEL_NO_ITTNOTIFY_API */ -#else /* INTEL_NO_MACRO_BODY */ -#define __itt_counter_dec_ptr 0 -#endif /* INTEL_NO_MACRO_BODY */ -/** @endcond */ -/** - * @brief Decrement the unsigned 64 bits integer counter value with x - * - * Calling this function to non-unsigned 64 bits integer counters has no effect - */ -void ITTAPI __itt_counter_dec_delta(__itt_counter id, unsigned long long value); - -#ifndef INTEL_NO_MACRO_BODY -#ifndef INTEL_NO_ITTNOTIFY_API -ITT_STUBV(ITTAPI, void, counter_dec_delta, (__itt_counter id, unsigned long long value)) -#define __itt_counter_dec_delta ITTNOTIFY_VOID(counter_dec_delta) -#define __itt_counter_dec_delta_ptr ITTNOTIFY_NAME(counter_dec_delta) -#else /* INTEL_NO_ITTNOTIFY_API */ -#define __itt_counter_dec_delta(id, value) -#define __itt_counter_dec_delta_ptr 0 -#endif /* INTEL_NO_ITTNOTIFY_API */ -#else /* INTEL_NO_MACRO_BODY */ -#define __itt_counter_dec_delta_ptr 0 -#endif /* INTEL_NO_MACRO_BODY */ -/** @endcond */ - -/** - * @ingroup counters - * @brief Increment a counter by one. - * The first call with a given name creates a counter by that name and sets its - * value to zero. Successive calls increment the counter value. - * @param[in] domain The domain controlling the call. Counter names are not domain specific. - * The domain argument is used only to enable or disable the API calls. - * @param[in] name The name of the counter - */ -void ITTAPI __itt_counter_inc_v3(const __itt_domain *domain, __itt_string_handle *name); - -/** - * @ingroup counters - * @brief Increment a counter by the value specified in delta. - * @param[in] domain The domain controlling the call. Counter names are not domain specific. - * The domain argument is used only to enable or disable the API calls. 
- * @param[in] name The name of the counter - * @param[in] delta The amount by which to increment the counter - */ -void ITTAPI __itt_counter_inc_delta_v3(const __itt_domain *domain, __itt_string_handle *name, unsigned long long delta); - -#ifndef INTEL_NO_MACRO_BODY -#ifndef INTEL_NO_ITTNOTIFY_API -ITT_STUBV(ITTAPI, void, counter_inc_v3, (const __itt_domain *domain, __itt_string_handle *name)) -ITT_STUBV(ITTAPI, void, counter_inc_delta_v3, (const __itt_domain *domain, __itt_string_handle *name, unsigned long long delta)) -#define __itt_counter_inc_v3(d,x) ITTNOTIFY_VOID_D1(counter_inc_v3,d,x) -#define __itt_counter_inc_v3_ptr ITTNOTIFY_NAME(counter_inc_v3) -#define __itt_counter_inc_delta_v3(d,x,y) ITTNOTIFY_VOID_D2(counter_inc_delta_v3,d,x,y) -#define __itt_counter_inc_delta_v3_ptr ITTNOTIFY_NAME(counter_inc_delta_v3) -#else /* INTEL_NO_ITTNOTIFY_API */ -#define __itt_counter_inc_v3(domain,name) -#define __itt_counter_inc_v3_ptr 0 -#define __itt_counter_inc_delta_v3(domain,name,delta) -#define __itt_counter_inc_delta_v3_ptr 0 -#endif /* INTEL_NO_ITTNOTIFY_API */ -#else /* INTEL_NO_MACRO_BODY */ -#define __itt_counter_inc_v3_ptr 0 -#define __itt_counter_inc_delta_v3_ptr 0 -#endif /* INTEL_NO_MACRO_BODY */ -/** @endcond */ - - -/** - * @ingroup counters - * @brief Decrement a counter by one. - * The first call with a given name creates a counter by that name and sets its - * value to zero. Successive calls decrement the counter value. - * @param[in] domain The domain controlling the call. Counter names are not domain specific. - * The domain argument is used only to enable or disable the API calls. - * @param[in] name The name of the counter - */ -void ITTAPI __itt_counter_dec_v3(const __itt_domain *domain, __itt_string_handle *name); - -/** - * @ingroup counters - * @brief Decrement a counter by the value specified in delta. - * @param[in] domain The domain controlling the call. Counter names are not domain specific. - * The domain argument is used only to enable or disable the API calls. 
- * @param[in] name The name of the counter - * @param[in] delta The amount by which to decrement the counter - */ -void ITTAPI __itt_counter_dec_delta_v3(const __itt_domain *domain, __itt_string_handle *name, unsigned long long delta); - -#ifndef INTEL_NO_MACRO_BODY -#ifndef INTEL_NO_ITTNOTIFY_API -ITT_STUBV(ITTAPI, void, counter_dec_v3, (const __itt_domain *domain, __itt_string_handle *name)) -ITT_STUBV(ITTAPI, void, counter_dec_delta_v3, (const __itt_domain *domain, __itt_string_handle *name, unsigned long long delta)) -#define __itt_counter_dec_v3(d,x) ITTNOTIFY_VOID_D1(counter_dec_v3,d,x) -#define __itt_counter_dec_v3_ptr ITTNOTIFY_NAME(counter_dec_v3) -#define __itt_counter_dec_delta_v3(d,x,y) ITTNOTIFY_VOID_D2(counter_dec_delta_v3,d,x,y) -#define __itt_counter_dec_delta_v3_ptr ITTNOTIFY_NAME(counter_dec_delta_v3) -#else /* INTEL_NO_ITTNOTIFY_API */ -#define __itt_counter_dec_v3(domain,name) -#define __itt_counter_dec_v3_ptr 0 -#define __itt_counter_dec_delta_v3(domain,name,delta) -#define __itt_counter_dec_delta_v3_ptr 0 -#endif /* INTEL_NO_ITTNOTIFY_API */ -#else /* INTEL_NO_MACRO_BODY */ -#define __itt_counter_dec_v3_ptr 0 -#define __itt_counter_dec_delta_v3_ptr 0 -#endif /* INTEL_NO_MACRO_BODY */ -/** @endcond */ - -/** @} counters group */ - - -/** - * @brief Set the counter value - */ -void ITTAPI __itt_counter_set_value(__itt_counter id, void *value_ptr); - -#ifndef INTEL_NO_MACRO_BODY -#ifndef INTEL_NO_ITTNOTIFY_API -ITT_STUBV(ITTAPI, void, counter_set_value, (__itt_counter id, void *value_ptr)) -#define __itt_counter_set_value ITTNOTIFY_VOID(counter_set_value) -#define __itt_counter_set_value_ptr ITTNOTIFY_NAME(counter_set_value) -#else /* INTEL_NO_ITTNOTIFY_API */ -#define __itt_counter_set_value(id, value_ptr) -#define __itt_counter_set_value_ptr 0 -#endif /* INTEL_NO_ITTNOTIFY_API */ -#else /* INTEL_NO_MACRO_BODY */ -#define __itt_counter_set_value_ptr 0 -#endif /* INTEL_NO_MACRO_BODY */ -/** @endcond */ - -/** - * @brief Set the counter value - */ -void ITTAPI __itt_counter_set_value_ex(__itt_counter id, __itt_clock_domain *clock_domain, unsigned long long timestamp, void *value_ptr); - -/** @cond exclude_from_documentation */ -#ifndef INTEL_NO_MACRO_BODY -#ifndef INTEL_NO_ITTNOTIFY_API -ITT_STUBV(ITTAPI, void, counter_set_value_ex, (__itt_counter id, __itt_clock_domain *clock_domain, unsigned long long timestamp, void *value_ptr)) -#define __itt_counter_set_value_ex ITTNOTIFY_VOID(counter_set_value_ex) -#define __itt_counter_set_value_ex_ptr ITTNOTIFY_NAME(counter_set_value_ex) -#else /* INTEL_NO_ITTNOTIFY_API */ -#define __itt_counter_set_value_ex(id, clock_domain, timestamp, value_ptr) -#define __itt_counter_set_value_ex_ptr 0 -#endif /* INTEL_NO_ITTNOTIFY_API */ -#else /* INTEL_NO_MACRO_BODY */ -#define __itt_counter_set_value_ex_ptr 0 -#endif /* INTEL_NO_MACRO_BODY */ -/** @endcond */ - -/** - * @brief Create a typed counter with given name/domain - * - * After __itt_counter_create_typed() is called, __itt_counter_inc(id), __itt_counter_inc_delta(id, delta), - * __itt_counter_set_value(id, value_ptr) or __itt_counter_set_value_ex(id, clock_domain, timestamp, value_ptr) - * can be used to change the value of the counter - */ -#if ITT_PLATFORM==ITT_PLATFORM_WIN -__itt_counter ITTAPI __itt_counter_create_typedA(const char *name, const char *domain, __itt_metadata_type type); -__itt_counter ITTAPI __itt_counter_create_typedW(const wchar_t *name, const wchar_t *domain, __itt_metadata_type type); -#if defined(UNICODE) || defined(_UNICODE) -# define 
__itt_counter_create_typed __itt_counter_create_typedW -# define __itt_counter_create_typed_ptr __itt_counter_create_typedW_ptr -#else /* UNICODE */ -# define __itt_counter_create_typed __itt_counter_create_typedA -# define __itt_counter_create_typed_ptr __itt_counter_create_typedA_ptr -#endif /* UNICODE */ -#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -__itt_counter ITTAPI __itt_counter_create_typed(const char *name, const char *domain, __itt_metadata_type type); -#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ - -#ifndef INTEL_NO_MACRO_BODY -#ifndef INTEL_NO_ITTNOTIFY_API -#if ITT_PLATFORM==ITT_PLATFORM_WIN -ITT_STUB(ITTAPI, __itt_counter, counter_create_typedA, (const char *name, const char *domain, __itt_metadata_type type)) -ITT_STUB(ITTAPI, __itt_counter, counter_create_typedW, (const wchar_t *name, const wchar_t *domain, __itt_metadata_type type)) -#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -ITT_STUB(ITTAPI, __itt_counter, counter_create_typed, (const char *name, const char *domain, __itt_metadata_type type)) -#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#if ITT_PLATFORM==ITT_PLATFORM_WIN -#define __itt_counter_create_typedA ITTNOTIFY_DATA(counter_create_typedA) -#define __itt_counter_create_typedA_ptr ITTNOTIFY_NAME(counter_create_typedA) -#define __itt_counter_create_typedW ITTNOTIFY_DATA(counter_create_typedW) -#define __itt_counter_create_typedW_ptr ITTNOTIFY_NAME(counter_create_typedW) -#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#define __itt_counter_create_typed ITTNOTIFY_DATA(counter_create_typed) -#define __itt_counter_create_typed_ptr ITTNOTIFY_NAME(counter_create_typed) -#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#else /* INTEL_NO_ITTNOTIFY_API */ -#if ITT_PLATFORM==ITT_PLATFORM_WIN -#define __itt_counter_create_typedA(name, domain, type) -#define __itt_counter_create_typedA_ptr 0 -#define __itt_counter_create_typedW(name, domain, type) -#define __itt_counter_create_typedW_ptr 0 -#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#define __itt_counter_create_typed(name, domain, type) -#define __itt_counter_create_typed_ptr 0 -#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#endif /* INTEL_NO_ITTNOTIFY_API */ -#else /* INTEL_NO_MACRO_BODY */ -#if ITT_PLATFORM==ITT_PLATFORM_WIN -#define __itt_counter_create_typedA_ptr 0 -#define __itt_counter_create_typedW_ptr 0 -#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#define __itt_counter_create_typed_ptr 0 -#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#endif /* INTEL_NO_MACRO_BODY */ -/** @endcond */ - -/** - * @brief Destroy the counter identified by the pointer previously returned by __itt_counter_create() or - * __itt_counter_create_typed() - */ -void ITTAPI __itt_counter_destroy(__itt_counter id); - -#ifndef INTEL_NO_MACRO_BODY -#ifndef INTEL_NO_ITTNOTIFY_API -ITT_STUBV(ITTAPI, void, counter_destroy, (__itt_counter id)) -#define __itt_counter_destroy ITTNOTIFY_VOID(counter_destroy) -#define __itt_counter_destroy_ptr ITTNOTIFY_NAME(counter_destroy) -#else /* INTEL_NO_ITTNOTIFY_API */ -#define __itt_counter_destroy(id) -#define __itt_counter_destroy_ptr 0 -#endif /* INTEL_NO_ITTNOTIFY_API */ -#else /* INTEL_NO_MACRO_BODY */ -#define __itt_counter_destroy_ptr 0 -#endif /* INTEL_NO_MACRO_BODY */ -/** @endcond */ -/** @} counters group */ - -/** - * @ingroup markers - * @brief Create a marker instance. - * @param[in] domain The domain for this marker - * @param[in] clock_domain The clock domain controlling the execution of this call. - * @param[in] timestamp The user defined timestamp. 
- * @param[in] id The instance ID for this marker, or __itt_null - * @param[in] name The name for this marker - * @param[in] scope The scope for this marker - */ -void ITTAPI __itt_marker_ex(const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id id, __itt_string_handle *name, __itt_scope scope); - -/** @cond exclude_from_documentation */ -#ifndef INTEL_NO_MACRO_BODY -#ifndef INTEL_NO_ITTNOTIFY_API -ITT_STUBV(ITTAPI, void, marker_ex, (const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id id, __itt_string_handle *name, __itt_scope scope)) -#define __itt_marker_ex(d,x,y,z,a,b) ITTNOTIFY_VOID_D5(marker_ex,d,x,y,z,a,b) -#define __itt_marker_ex_ptr ITTNOTIFY_NAME(marker_ex) -#else /* INTEL_NO_ITTNOTIFY_API */ -#define __itt_marker_ex(domain,clock_domain,timestamp,id,name,scope) -#define __itt_marker_ex_ptr 0 -#endif /* INTEL_NO_ITTNOTIFY_API */ -#else /* INTEL_NO_MACRO_BODY */ -#define __itt_marker_ex_ptr 0 -#endif /* INTEL_NO_MACRO_BODY */ -/** @endcond */ - -/** - * @ingroup clockdomain - * @brief Add a relation to the current task instance. - * The current task instance is the head of the relation. - * @param[in] domain The domain controlling this call - * @param[in] clock_domain The clock domain controlling the execution of this call. - * @param[in] timestamp The user defined timestamp. - * @param[in] relation The kind of relation - * @param[in] tail The ID for the tail of the relation - */ -void ITTAPI __itt_relation_add_to_current_ex(const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_relation relation, __itt_id tail); - -/** - * @ingroup clockdomain - * @brief Add a relation between two instance identifiers. - * @param[in] domain The domain controlling this call - * @param[in] clock_domain The clock domain controlling the execution of this call. - * @param[in] timestamp The user defined timestamp. 
- * @param[in] head The ID for the head of the relation - * @param[in] relation The kind of relation - * @param[in] tail The ID for the tail of the relation - */ -void ITTAPI __itt_relation_add_ex(const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id head, __itt_relation relation, __itt_id tail); - -/** @cond exclude_from_documentation */ -#ifndef INTEL_NO_MACRO_BODY -#ifndef INTEL_NO_ITTNOTIFY_API -ITT_STUBV(ITTAPI, void, relation_add_to_current_ex, (const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_relation relation, __itt_id tail)) -ITT_STUBV(ITTAPI, void, relation_add_ex, (const __itt_domain *domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id head, __itt_relation relation, __itt_id tail)) -#define __itt_relation_add_to_current_ex(d,x,y,z,a) ITTNOTIFY_VOID_D4(relation_add_to_current_ex,d,x,y,z,a) -#define __itt_relation_add_to_current_ex_ptr ITTNOTIFY_NAME(relation_add_to_current_ex) -#define __itt_relation_add_ex(d,x,y,z,a,b) ITTNOTIFY_VOID_D5(relation_add_ex,d,x,y,z,a,b) -#define __itt_relation_add_ex_ptr ITTNOTIFY_NAME(relation_add_ex) -#else /* INTEL_NO_ITTNOTIFY_API */ -#define __itt_relation_add_to_current_ex(domain,clock_domain,timestame,relation,tail) -#define __itt_relation_add_to_current_ex_ptr 0 -#define __itt_relation_add_ex(domain,clock_domain,timestamp,head,relation,tail) -#define __itt_relation_add_ex_ptr 0 -#endif /* INTEL_NO_ITTNOTIFY_API */ -#else /* INTEL_NO_MACRO_BODY */ -#define __itt_relation_add_to_current_ex_ptr 0 -#define __itt_relation_add_ex_ptr 0 -#endif /* INTEL_NO_MACRO_BODY */ -/** @endcond */ - -/** @cond exclude_from_documentation */ -typedef enum ___itt_track_group_type -{ - __itt_track_group_type_normal = 0 -} __itt_track_group_type; -/** @endcond */ - -/** @cond exclude_from_documentation */ -#pragma pack(push, 8) - -typedef struct ___itt_track_group -{ - __itt_string_handle* name; /*!< Name of the track group */ - struct ___itt_track* track; /*!< List of child tracks */ - __itt_track_group_type tgtype; /*!< Type of the track group */ - int extra1; /*!< Reserved. Must be zero */ - void* extra2; /*!< Reserved. Must be zero */ - struct ___itt_track_group* next; -} __itt_track_group; - -#pragma pack(pop) -/** @endcond */ - -/** - * @brief Placeholder for custom track types. Currently, "normal" custom track - * is the only available track type. - */ -typedef enum ___itt_track_type -{ - __itt_track_type_normal = 0 -#ifdef INTEL_ITTNOTIFY_API_PRIVATE - , __itt_track_type_queue -#endif /* INTEL_ITTNOTIFY_API_PRIVATE */ -} __itt_track_type; - -/** @cond exclude_from_documentation */ -#pragma pack(push, 8) - -typedef struct ___itt_track -{ - __itt_string_handle* name; /*!< Name of the track group */ - __itt_track_group* group; /*!< Parent group to a track */ - __itt_track_type ttype; /*!< Type of the track */ - int extra1; /*!< Reserved. Must be zero */ - void* extra2; /*!< Reserved. Must be zero */ - struct ___itt_track* next; -} __itt_track; - -#pragma pack(pop) -/** @endcond */ - -/** - * @brief Create logical track group. 
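- *
- * A minimal usage sketch (the group/track names are illustrative assumptions,
- * and the char variants of the create calls are assumed):
- * @code
- *     __itt_track_group* g = __itt_track_group_create(
- *         __itt_string_handle_create("io"), __itt_track_group_type_normal);
- *     __itt_track* t = __itt_track_create(
- *         g, __itt_string_handle_create("disk"), __itt_track_type_normal);
- *     __itt_set_track(t); // subsequent events are placed on this track
- * @endcode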
- */ -__itt_track_group* ITTAPI __itt_track_group_create(__itt_string_handle* name, __itt_track_group_type track_group_type); - -/** @cond exclude_from_documentation */ -#ifndef INTEL_NO_MACRO_BODY -#ifndef INTEL_NO_ITTNOTIFY_API -ITT_STUB(ITTAPI, __itt_track_group*, track_group_create, (__itt_string_handle* name, __itt_track_group_type track_group_type)) -#define __itt_track_group_create ITTNOTIFY_DATA(track_group_create) -#define __itt_track_group_create_ptr ITTNOTIFY_NAME(track_group_create) -#else /* INTEL_NO_ITTNOTIFY_API */ -#define __itt_track_group_create(name) (__itt_track_group*)0 -#define __itt_track_group_create_ptr 0 -#endif /* INTEL_NO_ITTNOTIFY_API */ -#else /* INTEL_NO_MACRO_BODY */ -#define __itt_track_group_create_ptr 0 -#endif /* INTEL_NO_MACRO_BODY */ -/** @endcond */ - -/** - * @brief Create logical track. - */ -__itt_track* ITTAPI __itt_track_create(__itt_track_group* track_group, __itt_string_handle* name, __itt_track_type track_type); - -/** @cond exclude_from_documentation */ -#ifndef INTEL_NO_MACRO_BODY -#ifndef INTEL_NO_ITTNOTIFY_API -ITT_STUB(ITTAPI, __itt_track*, track_create, (__itt_track_group* track_group,__itt_string_handle* name, __itt_track_type track_type)) -#define __itt_track_create ITTNOTIFY_DATA(track_create) -#define __itt_track_create_ptr ITTNOTIFY_NAME(track_create) -#else /* INTEL_NO_ITTNOTIFY_API */ -#define __itt_track_create(track_group,name,track_type) (__itt_track*)0 -#define __itt_track_create_ptr 0 -#endif /* INTEL_NO_ITTNOTIFY_API */ -#else /* INTEL_NO_MACRO_BODY */ -#define __itt_track_create_ptr 0 -#endif /* INTEL_NO_MACRO_BODY */ -/** @endcond */ - -/** - * @brief Set the logical track. - */ -void ITTAPI __itt_set_track(__itt_track* track); - -/** @cond exclude_from_documentation */ -#ifndef INTEL_NO_MACRO_BODY -#ifndef INTEL_NO_ITTNOTIFY_API -ITT_STUBV(ITTAPI, void, set_track, (__itt_track *track)) -#define __itt_set_track ITTNOTIFY_VOID(set_track) -#define __itt_set_track_ptr ITTNOTIFY_NAME(set_track) -#else /* INTEL_NO_ITTNOTIFY_API */ -#define __itt_set_track(track) -#define __itt_set_track_ptr 0 -#endif /* INTEL_NO_ITTNOTIFY_API */ -#else /* INTEL_NO_MACRO_BODY */ -#define __itt_set_track_ptr 0 -#endif /* INTEL_NO_MACRO_BODY */ -/** @endcond */ - -/* ========================================================================== */ -/** @cond exclude_from_gpa_documentation */ -/** - * @defgroup events Events - * @ingroup public - * Events group - * @{ - */ -/** @brief user event type */ -typedef int __itt_event; - -/** - * @brief Create an event notification - * @note name or namelen being null/name and namelen not matching, user event feature not enabled - * @return non-zero event identifier upon success and __itt_err otherwise - */ -#if ITT_PLATFORM==ITT_PLATFORM_WIN -__itt_event LIBITTAPI __itt_event_createA(const char *name, int namelen); -__itt_event LIBITTAPI __itt_event_createW(const wchar_t *name, int namelen); -#if defined(UNICODE) || defined(_UNICODE) -# define __itt_event_create __itt_event_createW -# define __itt_event_create_ptr __itt_event_createW_ptr -#else -# define __itt_event_create __itt_event_createA -# define __itt_event_create_ptr __itt_event_createA_ptr -#endif /* UNICODE */ -#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -__itt_event LIBITTAPI __itt_event_create(const char *name, int namelen); -#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ - -/** @cond exclude_from_documentation */ -#ifndef INTEL_NO_MACRO_BODY -#ifndef INTEL_NO_ITTNOTIFY_API -#if ITT_PLATFORM==ITT_PLATFORM_WIN -ITT_STUB(LIBITTAPI, __itt_event, 
event_createA, (const char *name, int namelen)) -ITT_STUB(LIBITTAPI, __itt_event, event_createW, (const wchar_t *name, int namelen)) -#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -ITT_STUB(LIBITTAPI, __itt_event, event_create, (const char *name, int namelen)) -#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#if ITT_PLATFORM==ITT_PLATFORM_WIN -#define __itt_event_createA ITTNOTIFY_DATA(event_createA) -#define __itt_event_createA_ptr ITTNOTIFY_NAME(event_createA) -#define __itt_event_createW ITTNOTIFY_DATA(event_createW) -#define __itt_event_createW_ptr ITTNOTIFY_NAME(event_createW) -#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#define __itt_event_create ITTNOTIFY_DATA(event_create) -#define __itt_event_create_ptr ITTNOTIFY_NAME(event_create) -#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#else /* INTEL_NO_ITTNOTIFY_API */ -#if ITT_PLATFORM==ITT_PLATFORM_WIN -#define __itt_event_createA(name, namelen) (__itt_event)0 -#define __itt_event_createA_ptr 0 -#define __itt_event_createW(name, namelen) (__itt_event)0 -#define __itt_event_createW_ptr 0 -#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#define __itt_event_create(name, namelen) (__itt_event)0 -#define __itt_event_create_ptr 0 -#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#endif /* INTEL_NO_ITTNOTIFY_API */ -#else /* INTEL_NO_MACRO_BODY */ -#if ITT_PLATFORM==ITT_PLATFORM_WIN -#define __itt_event_createA_ptr 0 -#define __itt_event_createW_ptr 0 -#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#define __itt_event_create_ptr 0 -#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#endif /* INTEL_NO_MACRO_BODY */ -/** @endcond */ - -/** - * @brief Record an event occurrence. - * @return __itt_err upon failure (invalid event id/user event feature not enabled) - */ -int LIBITTAPI __itt_event_start(__itt_event event); - -/** @cond exclude_from_documentation */ -#ifndef INTEL_NO_MACRO_BODY -#ifndef INTEL_NO_ITTNOTIFY_API -ITT_STUB(LIBITTAPI, int, event_start, (__itt_event event)) -#define __itt_event_start ITTNOTIFY_DATA(event_start) -#define __itt_event_start_ptr ITTNOTIFY_NAME(event_start) -#else /* INTEL_NO_ITTNOTIFY_API */ -#define __itt_event_start(event) (int)0 -#define __itt_event_start_ptr 0 -#endif /* INTEL_NO_ITTNOTIFY_API */ -#else /* INTEL_NO_MACRO_BODY */ -#define __itt_event_start_ptr 0 -#endif /* INTEL_NO_MACRO_BODY */ -/** @endcond */ - -/** - * @brief Record an event end occurrence. - * @note It is optional if events do not have durations. 
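- *
- * A minimal usage sketch (the event name is an illustrative assumption; the
- * char variant of __itt_event_create is assumed):
- * @code
- *     __itt_event e = __itt_event_create("frame", 5);
- *     __itt_event_start(e);
- *     // ... measured activity ...
- *     __itt_event_end(e);
- * @endcode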
- * @return __itt_err upon failure (invalid event id/user event feature not enabled) - */ -int LIBITTAPI __itt_event_end(__itt_event event); - -/** @cond exclude_from_documentation */ -#ifndef INTEL_NO_MACRO_BODY -#ifndef INTEL_NO_ITTNOTIFY_API -ITT_STUB(LIBITTAPI, int, event_end, (__itt_event event)) -#define __itt_event_end ITTNOTIFY_DATA(event_end) -#define __itt_event_end_ptr ITTNOTIFY_NAME(event_end) -#else /* INTEL_NO_ITTNOTIFY_API */ -#define __itt_event_end(event) (int)0 -#define __itt_event_end_ptr 0 -#endif /* INTEL_NO_ITTNOTIFY_API */ -#else /* INTEL_NO_MACRO_BODY */ -#define __itt_event_end_ptr 0 -#endif /* INTEL_NO_MACRO_BODY */ -/** @endcond */ -/** @} events group */ - - -/** - * @defgroup arrays Arrays Visualizer - * @ingroup public - * Visualize arrays - * @{ - */ - -/** - * @enum __itt_av_data_type - * @brief Defines types of arrays data (for C/C++ intrinsic types) - */ -typedef enum -{ - __itt_e_first = 0, - __itt_e_char = 0, /* 1-byte integer */ - __itt_e_uchar, /* 1-byte unsigned integer */ - __itt_e_int16, /* 2-byte integer */ - __itt_e_uint16, /* 2-byte unsigned integer */ - __itt_e_int32, /* 4-byte integer */ - __itt_e_uint32, /* 4-byte unsigned integer */ - __itt_e_int64, /* 8-byte integer */ - __itt_e_uint64, /* 8-byte unsigned integer */ - __itt_e_float, /* 4-byte floating */ - __itt_e_double, /* 8-byte floating */ - __itt_e_last = __itt_e_double -} __itt_av_data_type; - -/** - * @brief Save an array data to a file. - * Output format is defined by the file extension. The csv and bmp formats are supported (bmp - for 2-dimensional array only). - * @param[in] data - pointer to the array data - * @param[in] rank - the rank of the array - * @param[in] dimensions - pointer to an array of integers, which specifies the array dimensions. - * The size of dimensions must be equal to the rank - * @param[in] type - the type of the array, specified as one of the __itt_av_data_type values (for intrinsic types) - * @param[in] filePath - the file path; the output format is defined by the file extension - * @param[in] columnOrder - defines how the array is stored in the linear memory. - * It should be 1 for column-major order (e.g. in FORTRAN) or 0 - for row-major order (e.g. in C). 
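- *
- * A minimal usage sketch (array contents and file name are illustrative assumptions):
- * @code
- *     float m[2][3] = {{1, 2, 3}, {4, 5, 6}};
- *     int dims[2] = {2, 3};
- *     __itt_av_save(m, 2, dims, __itt_e_float, "m.csv", 0); // row-major C array
- * @endcode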
- */
-
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-int ITTAPI __itt_av_saveA(void *data, int rank, const int *dimensions, int type, const char *filePath, int columnOrder);
-int ITTAPI __itt_av_saveW(void *data, int rank, const int *dimensions, int type, const wchar_t *filePath, int columnOrder);
-#if defined(UNICODE) || defined(_UNICODE)
-# define __itt_av_save __itt_av_saveW
-# define __itt_av_save_ptr __itt_av_saveW_ptr
-#else /* UNICODE */
-# define __itt_av_save __itt_av_saveA
-# define __itt_av_save_ptr __itt_av_saveA_ptr
-#endif /* UNICODE */
-#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-int ITTAPI __itt_av_save(void *data, int rank, const int *dimensions, int type, const char *filePath, int columnOrder);
-#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-
-/** @cond exclude_from_documentation */
-#ifndef INTEL_NO_MACRO_BODY
-#ifndef INTEL_NO_ITTNOTIFY_API
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-ITT_STUB(ITTAPI, int, av_saveA, (void *data, int rank, const int *dimensions, int type, const char *filePath, int columnOrder))
-ITT_STUB(ITTAPI, int, av_saveW, (void *data, int rank, const int *dimensions, int type, const wchar_t *filePath, int columnOrder))
-#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-ITT_STUB(ITTAPI, int, av_save, (void *data, int rank, const int *dimensions, int type, const char *filePath, int columnOrder))
-#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-#define __itt_av_saveA ITTNOTIFY_DATA(av_saveA)
-#define __itt_av_saveA_ptr ITTNOTIFY_NAME(av_saveA)
-#define __itt_av_saveW ITTNOTIFY_DATA(av_saveW)
-#define __itt_av_saveW_ptr ITTNOTIFY_NAME(av_saveW)
-#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#define __itt_av_save ITTNOTIFY_DATA(av_save)
-#define __itt_av_save_ptr ITTNOTIFY_NAME(av_save)
-#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#else /* INTEL_NO_ITTNOTIFY_API */
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-#define __itt_av_saveA(data, rank, dimensions, type, filePath, columnOrder) (int)0
-#define __itt_av_saveA_ptr 0
-#define __itt_av_saveW(data, rank, dimensions, type, filePath, columnOrder) (int)0
-#define __itt_av_saveW_ptr 0
-#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#define __itt_av_save(data, rank, dimensions, type, filePath, columnOrder) (int)0
-#define __itt_av_save_ptr 0
-#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-#define __itt_av_saveA_ptr 0
-#define __itt_av_saveW_ptr 0
-#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#define __itt_av_save_ptr 0
-#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#endif /* INTEL_NO_MACRO_BODY */
-/** @endcond */
-
-void ITTAPI __itt_enable_attach(void);
-
-/** @cond exclude_from_documentation */
-#ifndef INTEL_NO_MACRO_BODY
-#ifndef INTEL_NO_ITTNOTIFY_API
-ITT_STUBV(ITTAPI, void, enable_attach, (void))
-#define __itt_enable_attach ITTNOTIFY_VOID(enable_attach)
-#define __itt_enable_attach_ptr ITTNOTIFY_NAME(enable_attach)
-#else /* INTEL_NO_ITTNOTIFY_API */
-#define __itt_enable_attach()
-#define __itt_enable_attach_ptr 0
-#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
-#define __itt_enable_attach_ptr 0
-#endif /* INTEL_NO_MACRO_BODY */
-/** @endcond */
-
-/** @cond exclude_from_gpa_documentation */
-
-/** @} arrays group */
-
-/** @endcond */
-
-/**
- * @brief Module load notification
- * This API is used to report the necessary information when bypassing the default system loader.
- * Notification should be done immediately after this module is loaded into process memory.
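- *
- * A minimal sketch for a custom loader (base address, size, and path are
- * illustrative assumptions):
- * @code
- *     __itt_module_load(base, (char*)base + size, "/opt/app/plugin.so");
- * @endcode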
- * @param[in] start_addr - module start address - * @param[in] end_addr - module end address - * @param[in] path - file system full path to the module - */ -#if ITT_PLATFORM==ITT_PLATFORM_WIN -void ITTAPI __itt_module_loadA(void *start_addr, void *end_addr, const char *path); -void ITTAPI __itt_module_loadW(void *start_addr, void *end_addr, const wchar_t *path); -#if defined(UNICODE) || defined(_UNICODE) -# define __itt_module_load __itt_module_loadW -# define __itt_module_load_ptr __itt_module_loadW_ptr -#else /* UNICODE */ -# define __itt_module_load __itt_module_loadA -# define __itt_module_load_ptr __itt_module_loadA_ptr -#endif /* UNICODE */ -#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -void ITTAPI __itt_module_load(void *start_addr, void *end_addr, const char *path); -#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ - -/** @cond exclude_from_documentation */ -#ifndef INTEL_NO_MACRO_BODY -#ifndef INTEL_NO_ITTNOTIFY_API -#if ITT_PLATFORM==ITT_PLATFORM_WIN -ITT_STUB(ITTAPI, void, module_loadA, (void *start_addr, void *end_addr, const char *path)) -ITT_STUB(ITTAPI, void, module_loadW, (void *start_addr, void *end_addr, const wchar_t *path)) -#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -ITT_STUB(ITTAPI, void, module_load, (void *start_addr, void *end_addr, const char *path)) -#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#if ITT_PLATFORM==ITT_PLATFORM_WIN -#define __itt_module_loadA ITTNOTIFY_VOID(module_loadA) -#define __itt_module_loadA_ptr ITTNOTIFY_NAME(module_loadA) -#define __itt_module_loadW ITTNOTIFY_VOID(module_loadW) -#define __itt_module_loadW_ptr ITTNOTIFY_NAME(module_loadW) -#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#define __itt_module_load ITTNOTIFY_VOID(module_load) -#define __itt_module_load_ptr ITTNOTIFY_NAME(module_load) -#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#else /* INTEL_NO_ITTNOTIFY_API */ -#if ITT_PLATFORM==ITT_PLATFORM_WIN -#define __itt_module_loadA(start_addr, end_addr, path) -#define __itt_module_loadA_ptr 0 -#define __itt_module_loadW(start_addr, end_addr, path) -#define __itt_module_loadW_ptr 0 -#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#define __itt_module_load(start_addr, end_addr, path) -#define __itt_module_load_ptr 0 -#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#endif /* INTEL_NO_ITTNOTIFY_API */ -#else /* INTEL_NO_MACRO_BODY */ -#if ITT_PLATFORM==ITT_PLATFORM_WIN -#define __itt_module_loadA_ptr 0 -#define __itt_module_loadW_ptr 0 -#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#define __itt_module_load_ptr 0 -#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#endif /* INTEL_NO_MACRO_BODY */ -/** @endcond */ - -/** - * @brief Report module unload - * This API is used to report necessary information in case of bypassing default system loader. - * Notification should be done just before the module is unloaded from process memory. 
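- *
- * E.g., a custom loader that mapped a module at address base would report
- * (illustrative sketch):
- * @code
- *     __itt_module_unload(base);
- * @endcode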
- * @param[in] addr - base address of the loaded module
- */
-void ITTAPI __itt_module_unload(void *addr);
-
-/** @cond exclude_from_documentation */
-#ifndef INTEL_NO_MACRO_BODY
-#ifndef INTEL_NO_ITTNOTIFY_API
-ITT_STUBV(ITTAPI, void, module_unload, (void *addr))
-#define __itt_module_unload ITTNOTIFY_VOID(module_unload)
-#define __itt_module_unload_ptr ITTNOTIFY_NAME(module_unload)
-#else /* INTEL_NO_ITTNOTIFY_API */
-#define __itt_module_unload(addr)
-#define __itt_module_unload_ptr 0
-#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
-#define __itt_module_unload_ptr 0
-#endif /* INTEL_NO_MACRO_BODY */
-/** @endcond */
-
-/** @cond exclude_from_documentation */
-typedef enum
-{
-    __itt_module_type_unknown = 0,
-    __itt_module_type_elf,
-    __itt_module_type_coff
-} __itt_module_type;
-/** @endcond */
-
-/** @cond exclude_from_documentation */
-typedef enum
-{
-    itt_section_type_unknown,
-    itt_section_type_bss,  /* notifies that the section contains uninitialized data. These are the relevant section types and the modules that contain them:
-                            * ELF module: SHT_NOBITS section type
-                            * COFF module: IMAGE_SCN_CNT_UNINITIALIZED_DATA section type
-                            */
-    itt_section_type_data, /* notifies that the section contains initialized data. These are the relevant section types and the modules that contain them:
-                            * ELF module: SHT_PROGBITS section type
-                            * COFF module: IMAGE_SCN_CNT_INITIALIZED_DATA section type
-                            */
-    itt_section_type_text  /* notifies that the section contains executable code. These are the relevant section types and the modules that contain them:
-                            * ELF module: SHT_PROGBITS section type
-                            * COFF module: IMAGE_SCN_CNT_CODE section type
-                            */
-} __itt_section_type;
-/** @endcond */
-
-/**
- * @hideinitializer
- * @brief Bit mask that detects a section attribute indicating whether a section can be executed as code.
- * These are the relevant section attributes and the modules that contain them:
- * ELF module: PF_X section attribute
- * COFF module: IMAGE_SCN_MEM_EXECUTE attribute
- */
-#define __itt_section_exec 0x20000000
-
-/**
- * @hideinitializer
- * @brief Bit mask that detects a section attribute indicating whether a section can be read.
- * These are the relevant section attributes and the modules that contain them:
- * ELF module: PF_R attribute
- * COFF module: IMAGE_SCN_MEM_READ attribute
- */
-#define __itt_section_read 0x40000000
-
-/**
- * @hideinitializer
- * @brief Bit mask that detects a section attribute indicating whether a section can be written to.
- * These are the relevant section attributes and the modules that contain them:
- * ELF module: PF_W attribute
- * COFF module: IMAGE_SCN_MEM_WRITE attribute
- */
-#define __itt_section_write 0x80000000
-
-/** @cond exclude_from_documentation */
-#pragma pack(push, 8)
-
-typedef struct ___itt_section_info
-{
-    const char* name;        /*!< Section name in UTF8 */
-    __itt_section_type type; /*!< Section content and semantics description */
-    size_t flags;            /*!< Section bit flags that describe attributes using bit mask
-                              * Zero if disabled, non-zero if enabled
-                              */
-    void* start_addr;        /*!< Section load (relocated) start address */
-    size_t size;             /*!< Section size */
-    size_t file_offset;      /*!< Section file offset */
-} __itt_section_info;
-
-#pragma pack(pop)
-/** @endcond */
-
-/** @cond exclude_from_documentation */
-#pragma pack(push, 8)
-
-typedef struct ___itt_module_object
-{
-    unsigned int version;          /*!< API version */
-    __itt_id module_id;            /*!< Unique identifier. This is unchanged for sections that belong to the same module */
-    __itt_module_type module_type; /*!< Binary module format */
-    const char* module_name;       /*!< Unique module name or path to module in UTF8
-                                    * Contains the module name when module_buffer and module_size exist
-                                    * Contains the module path when module_buffer and module_size are absent
-                                    * module_name remains the same for a given module_id
-                                    */
-    void* module_buffer;           /*!< Module buffer content */
-    size_t module_size;            /*!< Module buffer size */
-    /*!< If module_buffer and module_size exist, the binary module is dumped onto the system.
-     * If module_buffer and module_size do not exist,
-     * the binary module already exists on the system.
-     * The module_name parameter contains the path to the module.
-     */
-    __itt_section_info* section_array; /*!< Reference to section information */
-    size_t section_number;
-} __itt_module_object;
-
-#pragma pack(pop)
-/** @endcond */
-
-/**
- * @brief Load module content and its loaded (relocated) sections.
- * This API is useful to save a module, or to specify its location on the system, and to report information about the loaded sections.
- * The target module is saved on the system if the module buffer content and size are available.
- * If the module buffer content and size are unavailable, the module name contains the path to the existing binary module.
- * @param[in] module_obj - provides module and section information, along with unique module identifiers (name, module ID)
- * which bind the binary module to particular sections.
- */
-void ITTAPI __itt_module_load_with_sections(__itt_module_object* module_obj);
-
-/** @cond exclude_from_documentation */
-#ifndef INTEL_NO_MACRO_BODY
-#ifndef INTEL_NO_ITTNOTIFY_API
-ITT_STUBV(ITTAPI, void, module_load_with_sections, (__itt_module_object* module_obj))
-#define __itt_module_load_with_sections ITTNOTIFY_VOID(module_load_with_sections)
-#define __itt_module_load_with_sections_ptr ITTNOTIFY_NAME(module_load_with_sections)
-#else /* INTEL_NO_ITTNOTIFY_API */
-#define __itt_module_load_with_sections(module_obj)
-#define __itt_module_load_with_sections_ptr 0
-#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
-#define __itt_module_load_with_sections_ptr 0
-#endif /* INTEL_NO_MACRO_BODY */
-/** @endcond */
-
-/**
- * @brief Unload a module and its loaded (relocated) sections.
- * This API notifies that the module and its sections were unloaded.
- * @param[in] module_obj - provides module and section information, along with unique module identifiers (name, module ID)
- * which bind the binary module to particular sections.
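- *
- * A minimal sketch pairing the load/unload notifications (all field values are
- * illustrative assumptions; a real loader fills them from its own bookkeeping):
- * @code
- *     __itt_module_object obj = {0};
- *     obj.module_type = __itt_module_type_elf;
- *     obj.module_name = "/opt/app/plugin.so"; // module already present on disk
- *     obj.section_array = sections;           // loader-provided section table
- *     obj.section_number = n_sections;
- *     __itt_module_load_with_sections(&obj);
- *     // ... just before unmapping the module:
- *     __itt_module_unload_with_sections(&obj);
- * @endcode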
- */ -void ITTAPI __itt_module_unload_with_sections(__itt_module_object* module_obj); - -/** @cond exclude_from_documentation */ -#ifndef INTEL_NO_MACRO_BODY -#ifndef INTEL_NO_ITTNOTIFY_API -ITT_STUBV(ITTAPI, void, module_unload_with_sections, (__itt_module_object* module_obj)) -#define __itt_module_unload_with_sections ITTNOTIFY_VOID(module_unload_with_sections) -#define __itt_module_unload_with_sections_ptr ITTNOTIFY_NAME(module_unload_with_sections) -#else /* INTEL_NO_ITTNOTIFY_API */ -#define __itt_module_unload_with_sections(module_obj) -#define __itt_module_unload_with_sections_ptr 0 -#endif /* INTEL_NO_ITTNOTIFY_API */ -#else /* INTEL_NO_MACRO_BODY */ -#define __itt_module_unload_with_sections_ptr 0 -#endif /* INTEL_NO_MACRO_BODY */ -/** @endcond */ - -/** @cond exclude_from_documentation */ -#pragma pack(push, 8) - -typedef struct ___itt_histogram -{ - const __itt_domain* domain; /*!< Domain of the histogram*/ - const char* nameA; /*!< Name of the histogram */ -#if defined(UNICODE) || defined(_UNICODE) - const wchar_t* nameW; -#else /* UNICODE || _UNICODE */ - void* nameW; -#endif /* UNICODE || _UNICODE */ - __itt_metadata_type x_type; /*!< Type of the histogram X axis */ - __itt_metadata_type y_type; /*!< Type of the histogram Y axis */ - int extra1; /*!< Reserved to the runtime */ - void* extra2; /*!< Reserved to the runtime */ - struct ___itt_histogram* next; -} __itt_histogram; - -#pragma pack(pop) -/** @endcond */ - -/** - * @brief Create a typed histogram instance with given name/domain. - * @param[in] domain The domain controlling the call. - * @param[in] name The name of the histogram. - * @param[in] x_type The type of the X axis in histogram (may be 0 to calculate batch statistics). - * @param[in] y_type The type of the Y axis in histogram. 
-*/ -#if ITT_PLATFORM==ITT_PLATFORM_WIN -__itt_histogram* ITTAPI __itt_histogram_createA(const __itt_domain* domain, const char* name, __itt_metadata_type x_type, __itt_metadata_type y_type); -__itt_histogram* ITTAPI __itt_histogram_createW(const __itt_domain* domain, const wchar_t* name, __itt_metadata_type x_type, __itt_metadata_type y_type); -#if defined(UNICODE) || defined(_UNICODE) -# define __itt_histogram_create __itt_histogram_createW -# define __itt_histogram_create_ptr __itt_histogram_createW_ptr -#else /* UNICODE */ -# define __itt_histogram_create __itt_histogram_createA -# define __itt_histogram_create_ptr __itt_histogram_createA_ptr -#endif /* UNICODE */ -#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -__itt_histogram* ITTAPI __itt_histogram_create(const __itt_domain* domain, const char* name, __itt_metadata_type x_type, __itt_metadata_type y_type); -#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ - -/** @cond exclude_from_documentation */ -#ifndef INTEL_NO_MACRO_BODY -#ifndef INTEL_NO_ITTNOTIFY_API -#if ITT_PLATFORM==ITT_PLATFORM_WIN -ITT_STUB(ITTAPI, __itt_histogram*, histogram_createA, (const __itt_domain* domain, const char* name, __itt_metadata_type x_type, __itt_metadata_type y_type)) -ITT_STUB(ITTAPI, __itt_histogram*, histogram_createW, (const __itt_domain* domain, const wchar_t* name, __itt_metadata_type x_type, __itt_metadata_type y_type)) -#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -ITT_STUB(ITTAPI, __itt_histogram*, histogram_create, (const __itt_domain* domain, const char* name, __itt_metadata_type x_type, __itt_metadata_type y_type)) -#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#if ITT_PLATFORM==ITT_PLATFORM_WIN -#define __itt_histogram_createA ITTNOTIFY_DATA(histogram_createA) -#define __itt_histogram_createA_ptr ITTNOTIFY_NAME(histogram_createA) -#define __itt_histogram_createW ITTNOTIFY_DATA(histogram_createW) -#define __itt_histogram_createW_ptr ITTNOTIFY_NAME(histogram_createW) -#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#define __itt_histogram_create ITTNOTIFY_DATA(histogram_create) -#define __itt_histogram_create_ptr ITTNOTIFY_NAME(histogram_create) -#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#else /* INTEL_NO_ITTNOTIFY_API */ -#if ITT_PLATFORM==ITT_PLATFORM_WIN -#define __itt_histogram_createA(domain, name, x_type, y_type) (__itt_histogram*)0 -#define __itt_histogram_createA_ptr 0 -#define __itt_histogram_createW(domain, name, x_type, y_type) (__itt_histogram*)0 -#define __itt_histogram_createW_ptr 0 -#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#define __itt_histogram_create(domain, name, x_type, y_type) (__itt_histogram*)0 -#define __itt_histogram_create_ptr 0 -#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#endif /* INTEL_NO_ITTNOTIFY_API */ -#else /* INTEL_NO_MACRO_BODY */ -#if ITT_PLATFORM==ITT_PLATFORM_WIN -#define __itt_histogram_createA_ptr 0 -#define __itt_histogram_createW_ptr 0 -#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#define __itt_histogram_create_ptr 0 -#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#endif /* INTEL_NO_MACRO_BODY */ -/** @endcond */ - -/** - * @brief Submit statistics for a histogram instance. - * @param[in] hist Pointer to the histogram instance to which the histogram statistic is to be dumped. - * @param[in] length The number of elements in dumped axis data array. - * @param[in] x_data The X axis dumped data itself (may be NULL to calculate batch statistics). - * @param[in] y_data The Y axis dumped data itself. 
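- *
- * A minimal usage sketch (assumes d is a previously created __itt_domain
- * pointer and the char variant of __itt_histogram_create; data values are
- * illustrative):
- * @code
- *     __itt_histogram* h = __itt_histogram_create(d, "latency", __itt_metadata_u64, __itt_metadata_u64);
- *     unsigned long long x[3] = {1, 2, 4}, y[3] = {10, 7, 3};
- *     __itt_histogram_submit(h, 3, x, y);
- * @endcode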
-*/
-void ITTAPI __itt_histogram_submit(__itt_histogram* hist, size_t length, void* x_data, void* y_data);
-
-/** @cond exclude_from_documentation */
-#ifndef INTEL_NO_MACRO_BODY
-#ifndef INTEL_NO_ITTNOTIFY_API
-ITT_STUBV(ITTAPI, void, histogram_submit, (__itt_histogram* hist, size_t length, void* x_data, void* y_data))
-#define __itt_histogram_submit ITTNOTIFY_VOID(histogram_submit)
-#define __itt_histogram_submit_ptr ITTNOTIFY_NAME(histogram_submit)
-#else /* INTEL_NO_ITTNOTIFY_API */
-#define __itt_histogram_submit(hist, length, x_data, y_data)
-#define __itt_histogram_submit_ptr 0
-#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
-#define __itt_histogram_submit_ptr 0
-#endif /* INTEL_NO_MACRO_BODY */
-
-/**
-* @brief Returns the current collection state at the moment of the call
-* @return the collection state as an enum __itt_collection_state
-*/
-__itt_collection_state __itt_get_collection_state(void);
-
-/**
-* @brief Releases the resources allocated by the static part of the ITT API.
-* This API should be called from the library destructor.
-* @return void
-*/
-void __itt_release_resources(void);
-/** @endcond */
-
-#ifdef __cplusplus
-}
-#endif /* __cplusplus */
-
-#endif /* _ITTNOTIFY_H_ */
-
-#ifdef INTEL_ITTNOTIFY_API_PRIVATE
-
-#ifndef _ITTNOTIFY_PRIVATE_
-#define _ITTNOTIFY_PRIVATE_
-
-#ifdef __cplusplus
-extern "C" {
-#endif /* __cplusplus */
-
-/**
- * @ingroup clockdomain
- * @brief Begin an overlapped task instance.
- * @param[in] domain The domain for this task
- * @param[in] clock_domain The clock domain controlling the execution of this call.
- * @param[in] timestamp The user defined timestamp.
- * @param[in] taskid The identifier for this task instance, *cannot* be __itt_null.
- * @param[in] parentid The parent of this task, or __itt_null.
- * @param[in] name The name of this task.
- */
-void ITTAPI __itt_task_begin_overlapped_ex(const __itt_domain* domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id taskid, __itt_id parentid, __itt_string_handle* name);
-
-/**
- * @ingroup clockdomain
- * @brief End an overlapped task instance.
- * @param[in] domain The domain for this task
- * @param[in] clock_domain The clock domain controlling the execution of this call.
- * @param[in] timestamp The user defined timestamp.
- * @param[in] taskid Explicit ID of the finished task
- */
-void ITTAPI __itt_task_end_overlapped_ex(const __itt_domain* domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id taskid);
-
-/** @cond exclude_from_documentation */
-#ifndef INTEL_NO_MACRO_BODY
-#ifndef INTEL_NO_ITTNOTIFY_API
-ITT_STUBV(ITTAPI, void, task_begin_overlapped_ex, (const __itt_domain* domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id taskid, __itt_id parentid, __itt_string_handle* name))
-ITT_STUBV(ITTAPI, void, task_end_overlapped_ex, (const __itt_domain* domain, __itt_clock_domain* clock_domain, unsigned long long timestamp, __itt_id taskid))
-#define __itt_task_begin_overlapped_ex(d,x,y,z,a,b) ITTNOTIFY_VOID_D5(task_begin_overlapped_ex,d,x,y,z,a,b)
-#define __itt_task_begin_overlapped_ex_ptr ITTNOTIFY_NAME(task_begin_overlapped_ex)
-#define __itt_task_end_overlapped_ex(d,x,y,z) ITTNOTIFY_VOID_D3(task_end_overlapped_ex,d,x,y,z)
-#define __itt_task_end_overlapped_ex_ptr ITTNOTIFY_NAME(task_end_overlapped_ex)
-#else /* INTEL_NO_ITTNOTIFY_API */
-#define __itt_task_begin_overlapped_ex(domain,clock_domain,timestamp,taskid,parentid,name)
-#define __itt_task_begin_overlapped_ex_ptr 0
-#define __itt_task_end_overlapped_ex(domain,clock_domain,timestamp,taskid)
-#define __itt_task_end_overlapped_ex_ptr 0
-#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
-#define __itt_task_begin_overlapped_ex_ptr 0
-#define __itt_task_end_overlapped_ptr 0
-#define __itt_task_end_overlapped_ex_ptr 0
-#endif /* INTEL_NO_MACRO_BODY */
-/** @endcond */
-
-/**
- * @defgroup marks_internal Marks
- * @ingroup internal
- * Marks group
- * @warning Internal API:
- *   - It is not shipped outside of Intel
- *   - It is delivered to internal Intel teams using e-mail or SVN access only
- * @{
- */
-/** @brief user mark type */
-typedef int __itt_mark_type;
-
-/**
- * @brief Creates a user mark type with the specified name using a char or Unicode string.
- * @param[in] name - name of the mark to create
- * @return Returns a handle to the mark type
- */
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-__itt_mark_type ITTAPI __itt_mark_createA(const char *name);
-__itt_mark_type ITTAPI __itt_mark_createW(const wchar_t *name);
-#if defined(UNICODE) || defined(_UNICODE)
-# define __itt_mark_create __itt_mark_createW
-# define __itt_mark_create_ptr __itt_mark_createW_ptr
-#else /* UNICODE */
-# define __itt_mark_create __itt_mark_createA
-# define __itt_mark_create_ptr __itt_mark_createA_ptr
-#endif /* UNICODE */
-#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-__itt_mark_type ITTAPI __itt_mark_create(const char *name);
-#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-
-/** @cond exclude_from_documentation */
-#ifndef INTEL_NO_MACRO_BODY
-#ifndef INTEL_NO_ITTNOTIFY_API
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-ITT_STUB(ITTAPI, __itt_mark_type, mark_createA, (const char *name))
-ITT_STUB(ITTAPI, __itt_mark_type, mark_createW, (const wchar_t *name))
-#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-ITT_STUB(ITTAPI, __itt_mark_type, mark_create, (const char *name))
-#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-#define __itt_mark_createA ITTNOTIFY_DATA(mark_createA)
-#define __itt_mark_createA_ptr ITTNOTIFY_NAME(mark_createA)
-#define __itt_mark_createW ITTNOTIFY_DATA(mark_createW)
-#define __itt_mark_createW_ptr ITTNOTIFY_NAME(mark_createW)
-#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#define __itt_mark_create ITTNOTIFY_DATA(mark_create)
-#define __itt_mark_create_ptr ITTNOTIFY_NAME(mark_create)
-#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#else /* INTEL_NO_ITTNOTIFY_API */
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-#define __itt_mark_createA(name) (__itt_mark_type)0
-#define __itt_mark_createA_ptr 0
-#define __itt_mark_createW(name) (__itt_mark_type)0
-#define __itt_mark_createW_ptr 0
-#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#define __itt_mark_create(name) (__itt_mark_type)0
-#define __itt_mark_create_ptr 0
-#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-#define __itt_mark_createA_ptr 0
-#define __itt_mark_createW_ptr 0
-#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#define __itt_mark_create_ptr 0
-#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#endif /* INTEL_NO_MACRO_BODY */
-/** @endcond */
-
-/**
- * @brief Creates a "discrete" user mark of the specified type with an optional string parameter (char or Unicode).
- *
- * - A "discrete" mark is placed into the collection results on success. It appears in overtime view(s) as a special tick sign.
- * - The call is "synchronous" - the function returns after the mark is actually added to the results.
- * - This function is useful, for example, to mark different phases of an application
- *   (the beginning of the next mark automatically means the end of the current region).
- * - Can be used together with "continuous" marks (see below) within the same collection session
- * @param[in] mt - mark, created by the __itt_mark_create(const char* name) function
- * @param[in] parameter - string parameter of the mark
- * @return Returns zero value in case of success, non-zero value otherwise.
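- *
- * A minimal usage sketch (phase names are illustrative assumptions):
- * @code
- *     __itt_mark_type phase = __itt_mark_create("phase");
- *     __itt_mark(phase, "parse");   // places a discrete mark labeled "parse"
- *     __itt_mark(phase, "codegen"); // implicitly ends the "parse" phase
- * @endcode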
- */ -#if ITT_PLATFORM==ITT_PLATFORM_WIN -int ITTAPI __itt_markA(__itt_mark_type mt, const char *parameter); -int ITTAPI __itt_markW(__itt_mark_type mt, const wchar_t *parameter); -#if defined(UNICODE) || defined(_UNICODE) -# define __itt_mark __itt_markW -# define __itt_mark_ptr __itt_markW_ptr -#else /* UNICODE */ -# define __itt_mark __itt_markA -# define __itt_mark_ptr __itt_markA_ptr -#endif /* UNICODE */ -#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -int ITTAPI __itt_mark(__itt_mark_type mt, const char *parameter); -#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ - -/** @cond exclude_from_documentation */ -#ifndef INTEL_NO_MACRO_BODY -#ifndef INTEL_NO_ITTNOTIFY_API -#if ITT_PLATFORM==ITT_PLATFORM_WIN -ITT_STUB(ITTAPI, int, markA, (__itt_mark_type mt, const char *parameter)) -ITT_STUB(ITTAPI, int, markW, (__itt_mark_type mt, const wchar_t *parameter)) -#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -ITT_STUB(ITTAPI, int, mark, (__itt_mark_type mt, const char *parameter)) -#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#if ITT_PLATFORM==ITT_PLATFORM_WIN -#define __itt_markA ITTNOTIFY_DATA(markA) -#define __itt_markA_ptr ITTNOTIFY_NAME(markA) -#define __itt_markW ITTNOTIFY_DATA(markW) -#define __itt_markW_ptr ITTNOTIFY_NAME(markW) -#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#define __itt_mark ITTNOTIFY_DATA(mark) -#define __itt_mark_ptr ITTNOTIFY_NAME(mark) -#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#else /* INTEL_NO_ITTNOTIFY_API */ -#if ITT_PLATFORM==ITT_PLATFORM_WIN -#define __itt_markA(mt, parameter) (int)0 -#define __itt_markA_ptr 0 -#define __itt_markW(mt, parameter) (int)0 -#define __itt_markW_ptr 0 -#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#define __itt_mark(mt, parameter) (int)0 -#define __itt_mark_ptr 0 -#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#endif /* INTEL_NO_ITTNOTIFY_API */ -#else /* INTEL_NO_MACRO_BODY */ -#if ITT_PLATFORM==ITT_PLATFORM_WIN -#define __itt_markA_ptr 0 -#define __itt_markW_ptr 0 -#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#define __itt_mark_ptr 0 -#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#endif /* INTEL_NO_MACRO_BODY */ -/** @endcond */ - -/** - * @brief Use this if necessary to create a "discrete" user event type (mark) for process - * rather then for one thread - * @see int __itt_mark(__itt_mark_type mt, const char* parameter); - */ -#if ITT_PLATFORM==ITT_PLATFORM_WIN -int ITTAPI __itt_mark_globalA(__itt_mark_type mt, const char *parameter); -int ITTAPI __itt_mark_globalW(__itt_mark_type mt, const wchar_t *parameter); -#if defined(UNICODE) || defined(_UNICODE) -# define __itt_mark_global __itt_mark_globalW -# define __itt_mark_global_ptr __itt_mark_globalW_ptr -#else /* UNICODE */ -# define __itt_mark_global __itt_mark_globalA -# define __itt_mark_global_ptr __itt_mark_globalA_ptr -#endif /* UNICODE */ -#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -int ITTAPI __itt_mark_global(__itt_mark_type mt, const char *parameter); -#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ - -/** @cond exclude_from_documentation */ -#ifndef INTEL_NO_MACRO_BODY -#ifndef INTEL_NO_ITTNOTIFY_API -#if ITT_PLATFORM==ITT_PLATFORM_WIN -ITT_STUB(ITTAPI, int, mark_globalA, (__itt_mark_type mt, const char *parameter)) -ITT_STUB(ITTAPI, int, mark_globalW, (__itt_mark_type mt, const wchar_t *parameter)) -#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -ITT_STUB(ITTAPI, int, mark_global, (__itt_mark_type mt, const char *parameter)) -#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#if ITT_PLATFORM==ITT_PLATFORM_WIN -#define __itt_mark_globalA ITTNOTIFY_DATA(mark_globalA) -#define 
__itt_mark_globalA_ptr ITTNOTIFY_NAME(mark_globalA)
-#define __itt_mark_globalW ITTNOTIFY_DATA(mark_globalW)
-#define __itt_mark_globalW_ptr ITTNOTIFY_NAME(mark_globalW)
-#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#define __itt_mark_global ITTNOTIFY_DATA(mark_global)
-#define __itt_mark_global_ptr ITTNOTIFY_NAME(mark_global)
-#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#else /* INTEL_NO_ITTNOTIFY_API */
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-#define __itt_mark_globalA(mt, parameter) (int)0
-#define __itt_mark_globalA_ptr 0
-#define __itt_mark_globalW(mt, parameter) (int)0
-#define __itt_mark_globalW_ptr 0
-#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#define __itt_mark_global(mt, parameter) (int)0
-#define __itt_mark_global_ptr 0
-#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-#define __itt_mark_globalA_ptr 0
-#define __itt_mark_globalW_ptr 0
-#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#define __itt_mark_global_ptr 0
-#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#endif /* INTEL_NO_MACRO_BODY */
-/** @endcond */
-
-/**
- * @brief Creates an "end" point for a "continuous" mark with the specified name.
- *
- * - Returns zero value in case of success, non-zero value otherwise.
- *   It also returns a non-zero value when the preceding "begin" point for the
- *   mark with the same name was not created or failed to be created.
- * - The mark of "continuous" type is placed into the collection results on
- *   success. It appears in overtime view(s) as a special tick
- *   sign (different from the "discrete" mark) together with a line from the
- *   corresponding "begin" mark to the "end" mark.
- * @note Continuous marks can overlap and be nested inside each other.
- *       A discrete mark can be nested inside a marked region
- * @param[in] mt - mark, created by the __itt_mark_create(const char* name) function
- * @return Returns zero value in case of success, non-zero value otherwise.
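- *
- * A minimal usage sketch, assuming the "begin" point is placed with __itt_mark()
- * as described above (names are illustrative):
- * @code
- *     __itt_mark_type region = __itt_mark_create("io");
- *     __itt_mark(region, "read"); // "begin" point of the continuous mark
- *     // ... marked region ...
- *     __itt_mark_off(region);     // matching "end" point
- * @endcode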
- */ -int ITTAPI __itt_mark_off(__itt_mark_type mt); - -/** @cond exclude_from_documentation */ -#ifndef INTEL_NO_MACRO_BODY -#ifndef INTEL_NO_ITTNOTIFY_API -ITT_STUB(ITTAPI, int, mark_off, (__itt_mark_type mt)) -#define __itt_mark_off ITTNOTIFY_DATA(mark_off) -#define __itt_mark_off_ptr ITTNOTIFY_NAME(mark_off) -#else /* INTEL_NO_ITTNOTIFY_API */ -#define __itt_mark_off(mt) (int)0 -#define __itt_mark_off_ptr 0 -#endif /* INTEL_NO_ITTNOTIFY_API */ -#else /* INTEL_NO_MACRO_BODY */ -#define __itt_mark_off_ptr 0 -#endif /* INTEL_NO_MACRO_BODY */ -/** @endcond */ - -/** - * @brief Use this if necessary to create an "end" point for mark of process - * @see int __itt_mark_off(__itt_mark_type mt); - */ -int ITTAPI __itt_mark_global_off(__itt_mark_type mt); - -/** @cond exclude_from_documentation */ -#ifndef INTEL_NO_MACRO_BODY -#ifndef INTEL_NO_ITTNOTIFY_API -ITT_STUB(ITTAPI, int, mark_global_off, (__itt_mark_type mt)) -#define __itt_mark_global_off ITTNOTIFY_DATA(mark_global_off) -#define __itt_mark_global_off_ptr ITTNOTIFY_NAME(mark_global_off) -#else /* INTEL_NO_ITTNOTIFY_API */ -#define __itt_mark_global_off(mt) (int)0 -#define __itt_mark_global_off_ptr 0 -#endif /* INTEL_NO_ITTNOTIFY_API */ -#else /* INTEL_NO_MACRO_BODY */ -#define __itt_mark_global_off_ptr 0 -#endif /* INTEL_NO_MACRO_BODY */ -/** @endcond */ -/** @} marks group */ - -/** - * @defgroup counters_internal Counters - * @ingroup internal - * Counters group - * @{ - */ - - -/** - * @defgroup stitch Stack Stitching - * @ingroup internal - * Stack Stitching group - * @{ - */ -/** - * @brief opaque structure for counter identification - */ -typedef struct ___itt_caller *__itt_caller; - -/** - * @brief Create the stitch point e.g. a point in call stack where other stacks should be stitched to. - * The function returns a unique identifier which is used to match the cut points with corresponding stitch points. - */ -__itt_caller ITTAPI __itt_stack_caller_create(void); - -/** @cond exclude_from_documentation */ -#ifndef INTEL_NO_MACRO_BODY -#ifndef INTEL_NO_ITTNOTIFY_API -ITT_STUB(ITTAPI, __itt_caller, stack_caller_create, (void)) -#define __itt_stack_caller_create ITTNOTIFY_DATA(stack_caller_create) -#define __itt_stack_caller_create_ptr ITTNOTIFY_NAME(stack_caller_create) -#else /* INTEL_NO_ITTNOTIFY_API */ -#define __itt_stack_caller_create() (__itt_caller)0 -#define __itt_stack_caller_create_ptr 0 -#endif /* INTEL_NO_ITTNOTIFY_API */ -#else /* INTEL_NO_MACRO_BODY */ -#define __itt_stack_caller_create_ptr 0 -#endif /* INTEL_NO_MACRO_BODY */ -/** @endcond */ - -/** - * @brief Destroy the information about stitch point identified by the pointer previously returned by __itt_stack_caller_create() - */ -void ITTAPI __itt_stack_caller_destroy(__itt_caller id); - -/** @cond exclude_from_documentation */ -#ifndef INTEL_NO_MACRO_BODY -#ifndef INTEL_NO_ITTNOTIFY_API -ITT_STUBV(ITTAPI, void, stack_caller_destroy, (__itt_caller id)) -#define __itt_stack_caller_destroy ITTNOTIFY_VOID(stack_caller_destroy) -#define __itt_stack_caller_destroy_ptr ITTNOTIFY_NAME(stack_caller_destroy) -#else /* INTEL_NO_ITTNOTIFY_API */ -#define __itt_stack_caller_destroy(id) -#define __itt_stack_caller_destroy_ptr 0 -#endif /* INTEL_NO_ITTNOTIFY_API */ -#else /* INTEL_NO_MACRO_BODY */ -#define __itt_stack_caller_destroy_ptr 0 -#endif /* INTEL_NO_MACRO_BODY */ -/** @endcond */ - -/** - * @brief Sets the cut point. 
Stack from each event which occurs after this call will be cut
- * at the same stack level the function was called and stitched to the corresponding stitch point.
- */
-void ITTAPI __itt_stack_callee_enter(__itt_caller id);
-
-/** @cond exclude_from_documentation */
-#ifndef INTEL_NO_MACRO_BODY
-#ifndef INTEL_NO_ITTNOTIFY_API
-ITT_STUBV(ITTAPI, void, stack_callee_enter, (__itt_caller id))
-#define __itt_stack_callee_enter ITTNOTIFY_VOID(stack_callee_enter)
-#define __itt_stack_callee_enter_ptr ITTNOTIFY_NAME(stack_callee_enter)
-#else /* INTEL_NO_ITTNOTIFY_API */
-#define __itt_stack_callee_enter(id)
-#define __itt_stack_callee_enter_ptr 0
-#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
-#define __itt_stack_callee_enter_ptr 0
-#endif /* INTEL_NO_MACRO_BODY */
-/** @endcond */
-
-/**
- * @brief This function eliminates the cut point which was set by latest __itt_stack_callee_enter().
- */
-void ITTAPI __itt_stack_callee_leave(__itt_caller id);
-
-/** @cond exclude_from_documentation */
-#ifndef INTEL_NO_MACRO_BODY
-#ifndef INTEL_NO_ITTNOTIFY_API
-ITT_STUBV(ITTAPI, void, stack_callee_leave, (__itt_caller id))
-#define __itt_stack_callee_leave ITTNOTIFY_VOID(stack_callee_leave)
-#define __itt_stack_callee_leave_ptr ITTNOTIFY_NAME(stack_callee_leave)
-#else /* INTEL_NO_ITTNOTIFY_API */
-#define __itt_stack_callee_leave(id)
-#define __itt_stack_callee_leave_ptr 0
-#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
-#define __itt_stack_callee_leave_ptr 0
-#endif /* INTEL_NO_MACRO_BODY */
-/** @endcond */
-
-/** @} stitch group */
-
-/* ***************************************************************************************************************************** */
-
-#include <stdarg.h>
-
-/** @cond exclude_from_documentation */
-typedef enum __itt_error_code
-{
- __itt_error_success = 0, /*!< no error */
- __itt_error_no_module = 1, /*!< module can't be loaded */
- /* %1$s -- library name; win: %2$d -- system error code; unx: %2$s -- system error message. */
- __itt_error_no_symbol = 2, /*!< symbol not found */
- /* %1$s -- library name, %2$s -- symbol name. */
- __itt_error_unknown_group = 3, /*!< unknown group specified */
- /* %1$s -- env var name, %2$s -- group name. */
- __itt_error_cant_read_env = 4, /*!< GetEnvironmentVariable() failed */
- /* %1$s -- env var name, %2$d -- system error. */
- __itt_error_env_too_long = 5, /*!< variable value too long */
- /* %1$s -- env var name, %2$d -- actual length of the var, %3$d -- max allowed length. */
- __itt_error_system = 6 /*!< pthread_mutexattr_init or pthread_mutex_init failed */
- /* %1$s -- function name, %2$d -- errno.
*/
-} __itt_error_code;
-
-typedef void (__itt_error_handler_t)(__itt_error_code code, va_list);
-__itt_error_handler_t* __itt_set_error_handler(__itt_error_handler_t*);
-
-const char* ITTAPI __itt_api_version(void);
-/** @endcond */
-
-/** @cond exclude_from_documentation */
-#ifndef INTEL_NO_MACRO_BODY
-#ifndef INTEL_NO_ITTNOTIFY_API
-#define __itt_error_handler ITT_JOIN(INTEL_ITTNOTIFY_PREFIX, error_handler)
-void __itt_error_handler(__itt_error_code code, va_list args);
-extern const int ITTNOTIFY_NAME(err);
-#define __itt_err ITTNOTIFY_NAME(err)
-ITT_STUB(ITTAPI, const char*, api_version, (void))
-#define __itt_api_version ITTNOTIFY_DATA(api_version)
-#define __itt_api_version_ptr ITTNOTIFY_NAME(api_version)
-#else /* INTEL_NO_ITTNOTIFY_API */
-#define __itt_api_version() (const char*)0
-#define __itt_api_version_ptr 0
-#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
-#define __itt_api_version_ptr 0
-#endif /* INTEL_NO_MACRO_BODY */
-/** @endcond */
-
-#ifdef __cplusplus
-}
-#endif /* __cplusplus */
-
-#endif /* _ITTNOTIFY_PRIVATE_ */
-
-#endif /* INTEL_ITTNOTIFY_API_PRIVATE */
diff --git a/src/common/ittnotify/legacy/ittnotify.h b/src/common/ittnotify/legacy/ittnotify.h
deleted file mode 100644
index 0215db72963..00000000000
--- a/src/common/ittnotify/legacy/ittnotify.h
+++ /dev/null
@@ -1,992 +0,0 @@
-/*
- Copyright (C) 2005-2019 Intel Corporation
-
- SPDX-License-Identifier: GPL-2.0-only OR BSD-3-Clause
-*/
-#ifndef _LEGACY_ITTNOTIFY_H_
-#define _LEGACY_ITTNOTIFY_H_
-
-/**
- * @file
- * @brief Legacy User API functions and types
- */
-
-/** @cond exclude_from_documentation */
-#ifndef ITT_OS_WIN
-# define ITT_OS_WIN 1
-#endif /* ITT_OS_WIN */
-
-#ifndef ITT_OS_LINUX
-# define ITT_OS_LINUX 2
-#endif /* ITT_OS_LINUX */
-
-#ifndef ITT_OS_MAC
-# define ITT_OS_MAC 3
-#endif /* ITT_OS_MAC */
-
-#ifndef ITT_OS_FREEBSD
-# define ITT_OS_FREEBSD 4
-#endif /* ITT_OS_FREEBSD */
-
-#ifndef ITT_OS
-# if defined WIN32 || defined _WIN32
-# define ITT_OS ITT_OS_WIN
-# elif defined( __APPLE__ ) && defined( __MACH__ )
-# define ITT_OS ITT_OS_MAC
-# elif defined( __FreeBSD__ )
-# define ITT_OS ITT_OS_FREEBSD
-# else
-# define ITT_OS ITT_OS_LINUX
-# endif
-#endif /* ITT_OS */
-
-#ifndef ITT_PLATFORM_WIN
-# define ITT_PLATFORM_WIN 1
-#endif /* ITT_PLATFORM_WIN */
-
-#ifndef ITT_PLATFORM_POSIX
-# define ITT_PLATFORM_POSIX 2
-#endif /* ITT_PLATFORM_POSIX */
-
-#ifndef ITT_PLATFORM_MAC
-# define ITT_PLATFORM_MAC 3
-#endif /* ITT_PLATFORM_MAC */
-
-#ifndef ITT_PLATFORM_FREEBSD
-# define ITT_PLATFORM_FREEBSD 4
-#endif /* ITT_PLATFORM_FREEBSD */
-
-#ifndef ITT_PLATFORM
-# if ITT_OS==ITT_OS_WIN
-# define ITT_PLATFORM ITT_PLATFORM_WIN
-# elif ITT_OS==ITT_OS_MAC
-# define ITT_PLATFORM ITT_PLATFORM_MAC
-# elif ITT_OS==ITT_OS_FREEBSD
-# define ITT_PLATFORM ITT_PLATFORM_FREEBSD
-# else
-# define ITT_PLATFORM ITT_PLATFORM_POSIX
-# endif
-#endif /* ITT_PLATFORM */
-
-#if defined(_UNICODE) && !defined(UNICODE)
-#define UNICODE
-#endif
-
-#include <stddef.h>
-#if ITT_PLATFORM==ITT_PLATFORM_WIN
-#include <tchar.h>
-#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-#include <stdint.h>
-#if defined(UNICODE) || defined(_UNICODE)
-#include <wchar.h>
-#endif /* UNICODE || _UNICODE */
-#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-
-#ifndef ITTAPI_CDECL
-# if ITT_PLATFORM==ITT_PLATFORM_WIN
-# define ITTAPI_CDECL __cdecl
-# else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
-# if defined _M_IX86 || defined __i386__
-# define ITTAPI_CDECL __attribute__ ((cdecl))
-# else /* _M_IX86 || __i386__ */
-# define ITTAPI_CDECL /* actual only on x86 platform */
-# endif /*
_M_IX86 || __i386__ */ -# endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#endif /* ITTAPI_CDECL */ - -#ifndef STDCALL -# if ITT_PLATFORM==ITT_PLATFORM_WIN -# define STDCALL __stdcall -# else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -# if defined _M_IX86 || defined __i386__ -# define STDCALL __attribute__ ((stdcall)) -# else /* _M_IX86 || __i386__ */ -# define STDCALL /* supported only on x86 platform */ -# endif /* _M_IX86 || __i386__ */ -# endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#endif /* STDCALL */ - -#define ITTAPI ITTAPI_CDECL -#define LIBITTAPI ITTAPI_CDECL - -/* TODO: Temporary for compatibility! */ -#define ITTAPI_CALL ITTAPI_CDECL -#define LIBITTAPI_CALL ITTAPI_CDECL - -#if ITT_PLATFORM==ITT_PLATFORM_WIN -/* use __forceinline (VC++ specific) */ -#if defined(__MINGW32__) && !defined(__cplusplus) -#define ITT_INLINE static __inline__ __attribute__((__always_inline__,__gnu_inline__)) -#else -#define ITT_INLINE static __forceinline -#endif /* __MINGW32__ */ - -#define ITT_INLINE_ATTRIBUTE /* nothing */ -#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -/* - * Generally, functions are not inlined unless optimization is specified. - * For functions declared inline, this attribute inlines the function even - * if no optimization level was specified. - */ -#ifdef __STRICT_ANSI__ -#define ITT_INLINE static -#define ITT_INLINE_ATTRIBUTE __attribute__((unused)) -#else /* __STRICT_ANSI__ */ -#define ITT_INLINE static inline -#define ITT_INLINE_ATTRIBUTE __attribute__((always_inline, unused)) -#endif /* __STRICT_ANSI__ */ -#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -/** @endcond */ - -/** @cond exclude_from_documentation */ -/* Helper macro for joining tokens */ -#define ITT_JOIN_AUX(p,n) p##n -#define ITT_JOIN(p,n) ITT_JOIN_AUX(p,n) - -#ifdef ITT_MAJOR -#undef ITT_MAJOR -#endif -#ifdef ITT_MINOR -#undef ITT_MINOR -#endif -#define ITT_MAJOR 3 -#define ITT_MINOR 0 - -/* Standard versioning of a token with major and minor version numbers */ -#define ITT_VERSIONIZE(x) \ - ITT_JOIN(x, \ - ITT_JOIN(_, \ - ITT_JOIN(ITT_MAJOR, \ - ITT_JOIN(_, ITT_MINOR)))) - -#ifndef INTEL_ITTNOTIFY_PREFIX -# define INTEL_ITTNOTIFY_PREFIX __itt_ -#endif /* INTEL_ITTNOTIFY_PREFIX */ -#ifndef INTEL_ITTNOTIFY_POSTFIX -# define INTEL_ITTNOTIFY_POSTFIX _ptr_ -#endif /* INTEL_ITTNOTIFY_POSTFIX */ - -#define ITTNOTIFY_NAME_AUX(n) ITT_JOIN(INTEL_ITTNOTIFY_PREFIX,n) -#define ITTNOTIFY_NAME(n) ITT_VERSIONIZE(ITTNOTIFY_NAME_AUX(ITT_JOIN(n,INTEL_ITTNOTIFY_POSTFIX))) - -#define ITTNOTIFY_VOID(n) (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n) -#define ITTNOTIFY_DATA(n) (!ITTNOTIFY_NAME(n)) ? 0 : ITTNOTIFY_NAME(n) - -#define ITTNOTIFY_VOID_D0(n,d) (d == NULL) ? (void)0 : (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d) -#define ITTNOTIFY_VOID_D1(n,d,x) (d == NULL) ? (void)0 : (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d,x) -#define ITTNOTIFY_VOID_D2(n,d,x,y) (d == NULL) ? (void)0 : (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d,x,y) -#define ITTNOTIFY_VOID_D3(n,d,x,y,z) (d == NULL) ? (void)0 : (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d,x,y,z) -#define ITTNOTIFY_VOID_D4(n,d,x,y,z,a) (d == NULL) ? (void)0 : (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d,x,y,z,a) -#define ITTNOTIFY_VOID_D5(n,d,x,y,z,a,b) (d == NULL) ? (void)0 : (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d,x,y,z,a,b) -#define ITTNOTIFY_VOID_D6(n,d,x,y,z,a,b,c) (d == NULL) ? 
(void)0 : (!(d)->flags) ? (void)0 : (!ITTNOTIFY_NAME(n)) ? (void)0 : ITTNOTIFY_NAME(n)(d,x,y,z,a,b,c) -#define ITTNOTIFY_DATA_D0(n,d) (d == NULL) ? 0 : (!(d)->flags) ? 0 : (!ITTNOTIFY_NAME(n)) ? 0 : ITTNOTIFY_NAME(n)(d) -#define ITTNOTIFY_DATA_D1(n,d,x) (d == NULL) ? 0 : (!(d)->flags) ? 0 : (!ITTNOTIFY_NAME(n)) ? 0 : ITTNOTIFY_NAME(n)(d,x) -#define ITTNOTIFY_DATA_D2(n,d,x,y) (d == NULL) ? 0 : (!(d)->flags) ? 0 : (!ITTNOTIFY_NAME(n)) ? 0 : ITTNOTIFY_NAME(n)(d,x,y) -#define ITTNOTIFY_DATA_D3(n,d,x,y,z) (d == NULL) ? 0 : (!(d)->flags) ? 0 : (!ITTNOTIFY_NAME(n)) ? 0 : ITTNOTIFY_NAME(n)(d,x,y,z) -#define ITTNOTIFY_DATA_D4(n,d,x,y,z,a) (d == NULL) ? 0 : (!(d)->flags) ? 0 : (!ITTNOTIFY_NAME(n)) ? 0 : ITTNOTIFY_NAME(n)(d,x,y,z,a) -#define ITTNOTIFY_DATA_D5(n,d,x,y,z,a,b) (d == NULL) ? 0 : (!(d)->flags) ? 0 : (!ITTNOTIFY_NAME(n)) ? 0 : ITTNOTIFY_NAME(n)(d,x,y,z,a,b) -#define ITTNOTIFY_DATA_D6(n,d,x,y,z,a,b,c) (d == NULL) ? 0 : (!(d)->flags) ? 0 : (!ITTNOTIFY_NAME(n)) ? 0 : ITTNOTIFY_NAME(n)(d,x,y,z,a,b,c) - -#ifdef ITT_STUB -#undef ITT_STUB -#endif -#ifdef ITT_STUBV -#undef ITT_STUBV -#endif -#define ITT_STUBV(api,type,name,args) \ - typedef type (api* ITT_JOIN(ITTNOTIFY_NAME(name),_t)) args; \ - extern ITT_JOIN(ITTNOTIFY_NAME(name),_t) ITTNOTIFY_NAME(name); -#define ITT_STUB ITT_STUBV -/** @endcond */ - -#ifdef __cplusplus -extern "C" { -#endif /* __cplusplus */ - -/** - * @defgroup legacy Legacy API - * @{ - * @} - */ - -/** - * @defgroup legacy_control Collection Control - * @ingroup legacy - * General behavior: application continues to run, but no profiling information is being collected - * - * Pausing occurs not only for the current thread but for all process as well as spawned processes - * - Intel(R) Parallel Inspector and Intel(R) Inspector XE: - * - Does not analyze or report errors that involve memory access. - * - Other errors are reported as usual. Pausing data collection in - * Intel(R) Parallel Inspector and Intel(R) Inspector XE - * only pauses tracing and analyzing memory access. - * It does not pause tracing or analyzing threading APIs. - * . - * - Intel(R) Parallel Amplifier and Intel(R) VTune(TM) Amplifier XE: - * - Does continue to record when new threads are started. - * . - * - Other effects: - * - Possible reduction of runtime overhead. - * . 
- * @{ - */ -#ifndef _ITTNOTIFY_H_ -/** @brief Pause collection */ -void ITTAPI __itt_pause(void); -/** @brief Resume collection */ -void ITTAPI __itt_resume(void); -/** @brief Detach collection */ -void ITTAPI __itt_detach(void); - -/** @cond exclude_from_documentation */ -#ifndef INTEL_NO_MACRO_BODY -#ifndef INTEL_NO_ITTNOTIFY_API -ITT_STUBV(ITTAPI, void, pause, (void)) -ITT_STUBV(ITTAPI, void, resume, (void)) -ITT_STUBV(ITTAPI, void, detach, (void)) -#define __itt_pause ITTNOTIFY_VOID(pause) -#define __itt_pause_ptr ITTNOTIFY_NAME(pause) -#define __itt_resume ITTNOTIFY_VOID(resume) -#define __itt_resume_ptr ITTNOTIFY_NAME(resume) -#define __itt_detach ITTNOTIFY_VOID(detach) -#define __itt_detach_ptr ITTNOTIFY_NAME(detach) -#else /* INTEL_NO_ITTNOTIFY_API */ -#define __itt_pause() -#define __itt_pause_ptr 0 -#define __itt_resume() -#define __itt_resume_ptr 0 -#define __itt_detach() -#define __itt_detach_ptr 0 -#endif /* INTEL_NO_ITTNOTIFY_API */ -#else /* INTEL_NO_MACRO_BODY */ -#define __itt_pause_ptr 0 -#define __itt_resume_ptr 0 -#define __itt_detach_ptr 0 -#endif /* INTEL_NO_MACRO_BODY */ -/** @endcond */ -#endif /* _ITTNOTIFY_H_ */ -/** @} legacy_control group */ - -/** - * @defgroup legacy_threads Threads - * @ingroup legacy - * Threads group - * @warning Legacy API - * @{ - */ -/** - * @deprecated Legacy API - * @brief Set name to be associated with thread in analysis GUI. - * @return __itt_err upon failure (name or namelen being null,name and namelen mismatched) - */ -#if ITT_PLATFORM==ITT_PLATFORM_WIN -int LIBITTAPI __itt_thr_name_setA(const char *name, int namelen); -int LIBITTAPI __itt_thr_name_setW(const wchar_t *name, int namelen); -#if defined(UNICODE) || defined(_UNICODE) -# define __itt_thr_name_set __itt_thr_name_setW -# define __itt_thr_name_set_ptr __itt_thr_name_setW_ptr -#else -# define __itt_thr_name_set __itt_thr_name_setA -# define __itt_thr_name_set_ptr __itt_thr_name_setA_ptr -#endif /* UNICODE */ -#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -int LIBITTAPI __itt_thr_name_set(const char *name, int namelen); -#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ - -/** @cond exclude_from_documentation */ -#ifndef INTEL_NO_MACRO_BODY -#ifndef INTEL_NO_ITTNOTIFY_API -#if ITT_PLATFORM==ITT_PLATFORM_WIN -ITT_STUB(LIBITTAPI, int, thr_name_setA, (const char *name, int namelen)) -ITT_STUB(LIBITTAPI, int, thr_name_setW, (const wchar_t *name, int namelen)) -#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -ITT_STUB(LIBITTAPI, int, thr_name_set, (const char *name, int namelen)) -#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#if ITT_PLATFORM==ITT_PLATFORM_WIN -#define __itt_thr_name_setA ITTNOTIFY_DATA(thr_name_setA) -#define __itt_thr_name_setA_ptr ITTNOTIFY_NAME(thr_name_setA) -#define __itt_thr_name_setW ITTNOTIFY_DATA(thr_name_setW) -#define __itt_thr_name_setW_ptr ITTNOTIFY_NAME(thr_name_setW) -#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#define __itt_thr_name_set ITTNOTIFY_DATA(thr_name_set) -#define __itt_thr_name_set_ptr ITTNOTIFY_NAME(thr_name_set) -#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#else /* INTEL_NO_ITTNOTIFY_API */ -#if ITT_PLATFORM==ITT_PLATFORM_WIN -#define __itt_thr_name_setA(name, namelen) -#define __itt_thr_name_setA_ptr 0 -#define __itt_thr_name_setW(name, namelen) -#define __itt_thr_name_setW_ptr 0 -#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#define __itt_thr_name_set(name, namelen) -#define __itt_thr_name_set_ptr 0 -#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#endif /* INTEL_NO_ITTNOTIFY_API */ -#else /* INTEL_NO_MACRO_BODY */ -#if 
ITT_PLATFORM==ITT_PLATFORM_WIN -#define __itt_thr_name_setA_ptr 0 -#define __itt_thr_name_setW_ptr 0 -#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#define __itt_thr_name_set_ptr 0 -#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#endif /* INTEL_NO_MACRO_BODY */ -/** @endcond */ - -/** - * @deprecated Legacy API - * @brief Mark current thread as ignored from this point on, for the duration of its existence. - */ -void LIBITTAPI __itt_thr_ignore(void); - -/** @cond exclude_from_documentation */ -#ifndef INTEL_NO_MACRO_BODY -#ifndef INTEL_NO_ITTNOTIFY_API -ITT_STUBV(LIBITTAPI, void, thr_ignore, (void)) -#define __itt_thr_ignore ITTNOTIFY_VOID(thr_ignore) -#define __itt_thr_ignore_ptr ITTNOTIFY_NAME(thr_ignore) -#else /* INTEL_NO_ITTNOTIFY_API */ -#define __itt_thr_ignore() -#define __itt_thr_ignore_ptr 0 -#endif /* INTEL_NO_ITTNOTIFY_API */ -#else /* INTEL_NO_MACRO_BODY */ -#define __itt_thr_ignore_ptr 0 -#endif /* INTEL_NO_MACRO_BODY */ -/** @endcond */ -/** @} legacy_threads group */ - -/** - * @defgroup legacy_sync Synchronization - * @ingroup legacy - * Synchronization group - * @warning Legacy API - * @{ - */ -/** - * @hideinitializer - * @brief possible value of attribute argument for sync object type - */ -#define __itt_attr_barrier 1 - -/** - * @hideinitializer - * @brief possible value of attribute argument for sync object type - */ -#define __itt_attr_mutex 2 - -/** - * @deprecated Legacy API - * @brief Assign a name to a sync object using char or Unicode string - * @param[in] addr - pointer to the sync object. You should use a real pointer to your object - * to make sure that the values don't clash with other object addresses - * @param[in] objtype - null-terminated object type string. If NULL is passed, the object will - * be assumed to be of generic "User Synchronization" type - * @param[in] objname - null-terminated object name string. If NULL, no name will be assigned - * to the object -- you can use the __itt_sync_rename call later to assign - * the name - * @param[in] attribute - one of [#__itt_attr_barrier, #__itt_attr_mutex] values which defines the - * exact semantics of how prepare/acquired/releasing calls work. 
- */ -#if ITT_PLATFORM==ITT_PLATFORM_WIN -void ITTAPI __itt_sync_set_nameA(void *addr, const char *objtype, const char *objname, int attribute); -void ITTAPI __itt_sync_set_nameW(void *addr, const wchar_t *objtype, const wchar_t *objname, int attribute); -#if defined(UNICODE) || defined(_UNICODE) -# define __itt_sync_set_name __itt_sync_set_nameW -# define __itt_sync_set_name_ptr __itt_sync_set_nameW_ptr -#else /* UNICODE */ -# define __itt_sync_set_name __itt_sync_set_nameA -# define __itt_sync_set_name_ptr __itt_sync_set_nameA_ptr -#endif /* UNICODE */ -#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -void ITTAPI __itt_sync_set_name(void *addr, const char* objtype, const char* objname, int attribute); -#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ - -/** @cond exclude_from_documentation */ -#ifndef INTEL_NO_MACRO_BODY -#ifndef INTEL_NO_ITTNOTIFY_API -#if ITT_PLATFORM==ITT_PLATFORM_WIN -ITT_STUBV(ITTAPI, void, sync_set_nameA, (void *addr, const char *objtype, const char *objname, int attribute)) -ITT_STUBV(ITTAPI, void, sync_set_nameW, (void *addr, const wchar_t *objtype, const wchar_t *objname, int attribute)) -#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -ITT_STUBV(ITTAPI, void, sync_set_name, (void *addr, const char *objtype, const char *objname, int attribute)) -#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#if ITT_PLATFORM==ITT_PLATFORM_WIN -#define __itt_sync_set_nameA ITTNOTIFY_VOID(sync_set_nameA) -#define __itt_sync_set_nameA_ptr ITTNOTIFY_NAME(sync_set_nameA) -#define __itt_sync_set_nameW ITTNOTIFY_VOID(sync_set_nameW) -#define __itt_sync_set_nameW_ptr ITTNOTIFY_NAME(sync_set_nameW) -#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#define __itt_sync_set_name ITTNOTIFY_VOID(sync_set_name) -#define __itt_sync_set_name_ptr ITTNOTIFY_NAME(sync_set_name) -#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#else /* INTEL_NO_ITTNOTIFY_API */ -#if ITT_PLATFORM==ITT_PLATFORM_WIN -#define __itt_sync_set_nameA(addr, objtype, objname, attribute) -#define __itt_sync_set_nameA_ptr 0 -#define __itt_sync_set_nameW(addr, objtype, objname, attribute) -#define __itt_sync_set_nameW_ptr 0 -#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#define __itt_sync_set_name(addr, objtype, objname, attribute) -#define __itt_sync_set_name_ptr 0 -#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#endif /* INTEL_NO_ITTNOTIFY_API */ -#else /* INTEL_NO_MACRO_BODY */ -#if ITT_PLATFORM==ITT_PLATFORM_WIN -#define __itt_sync_set_nameA_ptr 0 -#define __itt_sync_set_nameW_ptr 0 -#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#define __itt_sync_set_name_ptr 0 -#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#endif /* INTEL_NO_MACRO_BODY */ -/** @endcond */ - -/** - * @deprecated Legacy API - * @brief Assign a name and type to a sync object using char or Unicode string - * @param[in] addr - pointer to the sync object. You should use a real pointer to your object - * to make sure that the values don't clash with other object addresses - * @param[in] objtype - null-terminated object type string. If NULL is passed, the object will - * be assumed to be of generic "User Synchronization" type - * @param[in] objname - null-terminated object name string. If NULL, no name will be assigned - * to the object -- you can use the __itt_sync_rename call later to assign - * the name - * @param[in] typelen, namelen - a length of string for appropriate objtype and objname parameter - * @param[in] attribute - one of [#__itt_attr_barrier, #__itt_attr_mutex] values which defines the - * exact semantics of how prepare/acquired/releasing calls work. 
- * @return __itt_err upon failure (name or namelen being null,name and namelen mismatched) - */ -#if ITT_PLATFORM==ITT_PLATFORM_WIN -int LIBITTAPI __itt_notify_sync_nameA(void *addr, const char *objtype, int typelen, const char *objname, int namelen, int attribute); -int LIBITTAPI __itt_notify_sync_nameW(void *addr, const wchar_t *objtype, int typelen, const wchar_t *objname, int namelen, int attribute); -#if defined(UNICODE) || defined(_UNICODE) -# define __itt_notify_sync_name __itt_notify_sync_nameW -#else -# define __itt_notify_sync_name __itt_notify_sync_nameA -#endif -#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -int LIBITTAPI __itt_notify_sync_name(void *addr, const char *objtype, int typelen, const char *objname, int namelen, int attribute); -#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ - -/** @cond exclude_from_documentation */ -#ifndef INTEL_NO_MACRO_BODY -#ifndef INTEL_NO_ITTNOTIFY_API -#if ITT_PLATFORM==ITT_PLATFORM_WIN -ITT_STUB(LIBITTAPI, int, notify_sync_nameA, (void *addr, const char *objtype, int typelen, const char *objname, int namelen, int attribute)) -ITT_STUB(LIBITTAPI, int, notify_sync_nameW, (void *addr, const wchar_t *objtype, int typelen, const wchar_t *objname, int namelen, int attribute)) -#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -ITT_STUB(LIBITTAPI, int, notify_sync_name, (void *addr, const char *objtype, int typelen, const char *objname, int namelen, int attribute)) -#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#if ITT_PLATFORM==ITT_PLATFORM_WIN -#define __itt_notify_sync_nameA ITTNOTIFY_DATA(notify_sync_nameA) -#define __itt_notify_sync_nameA_ptr ITTNOTIFY_NAME(notify_sync_nameA) -#define __itt_notify_sync_nameW ITTNOTIFY_DATA(notify_sync_nameW) -#define __itt_notify_sync_nameW_ptr ITTNOTIFY_NAME(notify_sync_nameW) -#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#define __itt_notify_sync_name ITTNOTIFY_DATA(notify_sync_name) -#define __itt_notify_sync_name_ptr ITTNOTIFY_NAME(notify_sync_name) -#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#else /* INTEL_NO_ITTNOTIFY_API */ -#if ITT_PLATFORM==ITT_PLATFORM_WIN -#define __itt_notify_sync_nameA(addr, objtype, typelen, objname, namelen, attribute) -#define __itt_notify_sync_nameA_ptr 0 -#define __itt_notify_sync_nameW(addr, objtype, typelen, objname, namelen, attribute) -#define __itt_notify_sync_nameW_ptr 0 -#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#define __itt_notify_sync_name(addr, objtype, typelen, objname, namelen, attribute) -#define __itt_notify_sync_name_ptr 0 -#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#endif /* INTEL_NO_ITTNOTIFY_API */ -#else /* INTEL_NO_MACRO_BODY */ -#if ITT_PLATFORM==ITT_PLATFORM_WIN -#define __itt_notify_sync_nameA_ptr 0 -#define __itt_notify_sync_nameW_ptr 0 -#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#define __itt_notify_sync_name_ptr 0 -#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#endif /* INTEL_NO_MACRO_BODY */ -/** @endcond */ - -/** - * @deprecated Legacy API - * @brief Enter spin loop on user-defined sync object - */ -void LIBITTAPI __itt_notify_sync_prepare(void* addr); - -/** @cond exclude_from_documentation */ -#ifndef INTEL_NO_MACRO_BODY -#ifndef INTEL_NO_ITTNOTIFY_API -ITT_STUBV(LIBITTAPI, void, notify_sync_prepare, (void *addr)) -#define __itt_notify_sync_prepare ITTNOTIFY_VOID(notify_sync_prepare) -#define __itt_notify_sync_prepare_ptr ITTNOTIFY_NAME(notify_sync_prepare) -#else /* INTEL_NO_ITTNOTIFY_API */ -#define __itt_notify_sync_prepare(addr) -#define __itt_notify_sync_prepare_ptr 0 -#endif /* INTEL_NO_ITTNOTIFY_API */ -#else /* INTEL_NO_MACRO_BODY */ -#define 
__itt_notify_sync_prepare_ptr 0 -#endif /* INTEL_NO_MACRO_BODY */ -/** @endcond */ - -/** - * @deprecated Legacy API - * @brief Quit spin loop without acquiring spin object - */ -void LIBITTAPI __itt_notify_sync_cancel(void *addr); - -/** @cond exclude_from_documentation */ -#ifndef INTEL_NO_MACRO_BODY -#ifndef INTEL_NO_ITTNOTIFY_API -ITT_STUBV(LIBITTAPI, void, notify_sync_cancel, (void *addr)) -#define __itt_notify_sync_cancel ITTNOTIFY_VOID(notify_sync_cancel) -#define __itt_notify_sync_cancel_ptr ITTNOTIFY_NAME(notify_sync_cancel) -#else /* INTEL_NO_ITTNOTIFY_API */ -#define __itt_notify_sync_cancel(addr) -#define __itt_notify_sync_cancel_ptr 0 -#endif /* INTEL_NO_ITTNOTIFY_API */ -#else /* INTEL_NO_MACRO_BODY */ -#define __itt_notify_sync_cancel_ptr 0 -#endif /* INTEL_NO_MACRO_BODY */ -/** @endcond */ - -/** - * @deprecated Legacy API - * @brief Successful spin loop completion (sync object acquired) - */ -void LIBITTAPI __itt_notify_sync_acquired(void *addr); - -/** @cond exclude_from_documentation */ -#ifndef INTEL_NO_MACRO_BODY -#ifndef INTEL_NO_ITTNOTIFY_API -ITT_STUBV(LIBITTAPI, void, notify_sync_acquired, (void *addr)) -#define __itt_notify_sync_acquired ITTNOTIFY_VOID(notify_sync_acquired) -#define __itt_notify_sync_acquired_ptr ITTNOTIFY_NAME(notify_sync_acquired) -#else /* INTEL_NO_ITTNOTIFY_API */ -#define __itt_notify_sync_acquired(addr) -#define __itt_notify_sync_acquired_ptr 0 -#endif /* INTEL_NO_ITTNOTIFY_API */ -#else /* INTEL_NO_MACRO_BODY */ -#define __itt_notify_sync_acquired_ptr 0 -#endif /* INTEL_NO_MACRO_BODY */ -/** @endcond */ - -/** - * @deprecated Legacy API - * @brief Start sync object releasing code. Is called before the lock release call. - */ -void LIBITTAPI __itt_notify_sync_releasing(void* addr); - -/** @cond exclude_from_documentation */ -#ifndef INTEL_NO_MACRO_BODY -#ifndef INTEL_NO_ITTNOTIFY_API -ITT_STUBV(LIBITTAPI, void, notify_sync_releasing, (void *addr)) -#define __itt_notify_sync_releasing ITTNOTIFY_VOID(notify_sync_releasing) -#define __itt_notify_sync_releasing_ptr ITTNOTIFY_NAME(notify_sync_releasing) -#else /* INTEL_NO_ITTNOTIFY_API */ -#define __itt_notify_sync_releasing(addr) -#define __itt_notify_sync_releasing_ptr 0 -#endif /* INTEL_NO_ITTNOTIFY_API */ -#else /* INTEL_NO_MACRO_BODY */ -#define __itt_notify_sync_releasing_ptr 0 -#endif /* INTEL_NO_MACRO_BODY */ -/** @endcond */ -/** @} legacy_sync group */ - -#ifndef _ITTNOTIFY_H_ -/** - * @defgroup legacy_events Events - * @ingroup legacy - * Events group - * @{ - */ - -/** @brief user event type */ -typedef int __itt_event; - -/** - * @brief Create an event notification - * @note name or namelen being null/name and namelen not matching, user event feature not enabled - * @return non-zero event identifier upon success and __itt_err otherwise - */ -#if ITT_PLATFORM==ITT_PLATFORM_WIN -__itt_event LIBITTAPI __itt_event_createA(const char *name, int namelen); -__itt_event LIBITTAPI __itt_event_createW(const wchar_t *name, int namelen); -#if defined(UNICODE) || defined(_UNICODE) -# define __itt_event_create __itt_event_createW -# define __itt_event_create_ptr __itt_event_createW_ptr -#else -# define __itt_event_create __itt_event_createA -# define __itt_event_create_ptr __itt_event_createA_ptr -#endif /* UNICODE */ -#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -__itt_event LIBITTAPI __itt_event_create(const char *name, int namelen); -#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ - -/** @cond exclude_from_documentation */ -#ifndef INTEL_NO_MACRO_BODY -#ifndef INTEL_NO_ITTNOTIFY_API -#if 
ITT_PLATFORM==ITT_PLATFORM_WIN -ITT_STUB(LIBITTAPI, __itt_event, event_createA, (const char *name, int namelen)) -ITT_STUB(LIBITTAPI, __itt_event, event_createW, (const wchar_t *name, int namelen)) -#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -ITT_STUB(LIBITTAPI, __itt_event, event_create, (const char *name, int namelen)) -#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#if ITT_PLATFORM==ITT_PLATFORM_WIN -#define __itt_event_createA ITTNOTIFY_DATA(event_createA) -#define __itt_event_createA_ptr ITTNOTIFY_NAME(event_createA) -#define __itt_event_createW ITTNOTIFY_DATA(event_createW) -#define __itt_event_createW_ptr ITTNOTIFY_NAME(event_createW) -#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#define __itt_event_create ITTNOTIFY_DATA(event_create) -#define __itt_event_create_ptr ITTNOTIFY_NAME(event_create) -#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#else /* INTEL_NO_ITTNOTIFY_API */ -#if ITT_PLATFORM==ITT_PLATFORM_WIN -#define __itt_event_createA(name, namelen) (__itt_event)0 -#define __itt_event_createA_ptr 0 -#define __itt_event_createW(name, namelen) (__itt_event)0 -#define __itt_event_createW_ptr 0 -#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#define __itt_event_create(name, namelen) (__itt_event)0 -#define __itt_event_create_ptr 0 -#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#endif /* INTEL_NO_ITTNOTIFY_API */ -#else /* INTEL_NO_MACRO_BODY */ -#if ITT_PLATFORM==ITT_PLATFORM_WIN -#define __itt_event_createA_ptr 0 -#define __itt_event_createW_ptr 0 -#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#define __itt_event_create_ptr 0 -#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#endif /* INTEL_NO_MACRO_BODY */ -/** @endcond */ - -/** - * @brief Record an event occurrence. - * @return __itt_err upon failure (invalid event id/user event feature not enabled) - */ -int LIBITTAPI __itt_event_start(__itt_event event); - -/** @cond exclude_from_documentation */ -#ifndef INTEL_NO_MACRO_BODY -#ifndef INTEL_NO_ITTNOTIFY_API -ITT_STUB(LIBITTAPI, int, event_start, (__itt_event event)) -#define __itt_event_start ITTNOTIFY_DATA(event_start) -#define __itt_event_start_ptr ITTNOTIFY_NAME(event_start) -#else /* INTEL_NO_ITTNOTIFY_API */ -#define __itt_event_start(event) (int)0 -#define __itt_event_start_ptr 0 -#endif /* INTEL_NO_ITTNOTIFY_API */ -#else /* INTEL_NO_MACRO_BODY */ -#define __itt_event_start_ptr 0 -#endif /* INTEL_NO_MACRO_BODY */ -/** @endcond */ - -/** - * @brief Record an event end occurrence. - * @note It is optional if events do not have durations. 
- * @return __itt_err upon failure (invalid event id/user event feature not enabled) - */ -int LIBITTAPI __itt_event_end(__itt_event event); - -/** @cond exclude_from_documentation */ -#ifndef INTEL_NO_MACRO_BODY -#ifndef INTEL_NO_ITTNOTIFY_API -ITT_STUB(LIBITTAPI, int, event_end, (__itt_event event)) -#define __itt_event_end ITTNOTIFY_DATA(event_end) -#define __itt_event_end_ptr ITTNOTIFY_NAME(event_end) -#else /* INTEL_NO_ITTNOTIFY_API */ -#define __itt_event_end(event) (int)0 -#define __itt_event_end_ptr 0 -#endif /* INTEL_NO_ITTNOTIFY_API */ -#else /* INTEL_NO_MACRO_BODY */ -#define __itt_event_end_ptr 0 -#endif /* INTEL_NO_MACRO_BODY */ -/** @endcond */ -/** @} legacy_events group */ -#endif /* _ITTNOTIFY_H_ */ - -/** - * @defgroup legacy_memory Memory Accesses - * @ingroup legacy - */ - -/** - * @deprecated Legacy API - * @brief Inform the tool of memory accesses on reading - */ -void LIBITTAPI __itt_memory_read(void *addr, size_t size); - -/** @cond exclude_from_documentation */ -#ifndef INTEL_NO_MACRO_BODY -#ifndef INTEL_NO_ITTNOTIFY_API -ITT_STUBV(LIBITTAPI, void, memory_read, (void *addr, size_t size)) -#define __itt_memory_read ITTNOTIFY_VOID(memory_read) -#define __itt_memory_read_ptr ITTNOTIFY_NAME(memory_read) -#else /* INTEL_NO_ITTNOTIFY_API */ -#define __itt_memory_read(addr, size) -#define __itt_memory_read_ptr 0 -#endif /* INTEL_NO_ITTNOTIFY_API */ -#else /* INTEL_NO_MACRO_BODY */ -#define __itt_memory_read_ptr 0 -#endif /* INTEL_NO_MACRO_BODY */ -/** @endcond */ - -/** - * @deprecated Legacy API - * @brief Inform the tool of memory accesses on writing - */ -void LIBITTAPI __itt_memory_write(void *addr, size_t size); - -/** @cond exclude_from_documentation */ -#ifndef INTEL_NO_MACRO_BODY -#ifndef INTEL_NO_ITTNOTIFY_API -ITT_STUBV(LIBITTAPI, void, memory_write, (void *addr, size_t size)) -#define __itt_memory_write ITTNOTIFY_VOID(memory_write) -#define __itt_memory_write_ptr ITTNOTIFY_NAME(memory_write) -#else /* INTEL_NO_ITTNOTIFY_API */ -#define __itt_memory_write(addr, size) -#define __itt_memory_write_ptr 0 -#endif /* INTEL_NO_ITTNOTIFY_API */ -#else /* INTEL_NO_MACRO_BODY */ -#define __itt_memory_write_ptr 0 -#endif /* INTEL_NO_MACRO_BODY */ -/** @endcond */ - -/** - * @deprecated Legacy API - * @brief Inform the tool of memory accesses on updating - */ -void LIBITTAPI __itt_memory_update(void *address, size_t size); - -/** @cond exclude_from_documentation */ -#ifndef INTEL_NO_MACRO_BODY -#ifndef INTEL_NO_ITTNOTIFY_API -ITT_STUBV(LIBITTAPI, void, memory_update, (void *addr, size_t size)) -#define __itt_memory_update ITTNOTIFY_VOID(memory_update) -#define __itt_memory_update_ptr ITTNOTIFY_NAME(memory_update) -#else /* INTEL_NO_ITTNOTIFY_API */ -#define __itt_memory_update(addr, size) -#define __itt_memory_update_ptr 0 -#endif /* INTEL_NO_ITTNOTIFY_API */ -#else /* INTEL_NO_MACRO_BODY */ -#define __itt_memory_update_ptr 0 -#endif /* INTEL_NO_MACRO_BODY */ -/** @endcond */ -/** @} legacy_memory group */ - -/** - * @defgroup legacy_state Thread and Object States - * @ingroup legacy - */ - -/** @brief state type */ -typedef int __itt_state_t; - -/** @cond exclude_from_documentation */ -typedef enum __itt_obj_state { - __itt_obj_state_err = 0, - __itt_obj_state_clr = 1, - __itt_obj_state_set = 2, - __itt_obj_state_use = 3 -} __itt_obj_state_t; - -typedef enum __itt_thr_state { - __itt_thr_state_err = 0, - __itt_thr_state_clr = 1, - __itt_thr_state_set = 2 -} __itt_thr_state_t; - -typedef enum __itt_obj_prop { - __itt_obj_prop_watch = 1, - __itt_obj_prop_ignore = 2, - 
__itt_obj_prop_sharable = 3 -} __itt_obj_prop_t; - -typedef enum __itt_thr_prop { - __itt_thr_prop_quiet = 1 -} __itt_thr_prop_t; -/** @endcond */ - -/** - * @deprecated Legacy API - * @brief managing thread and object states - */ -__itt_state_t LIBITTAPI __itt_state_get(void); - -/** @cond exclude_from_documentation */ -#ifndef INTEL_NO_MACRO_BODY -#ifndef INTEL_NO_ITTNOTIFY_API -ITT_STUB(ITTAPI, __itt_state_t, state_get, (void)) -#define __itt_state_get ITTNOTIFY_DATA(state_get) -#define __itt_state_get_ptr ITTNOTIFY_NAME(state_get) -#else /* INTEL_NO_ITTNOTIFY_API */ -#define __itt_state_get(void) (__itt_state_t)0 -#define __itt_state_get_ptr 0 -#endif /* INTEL_NO_ITTNOTIFY_API */ -#else /* INTEL_NO_MACRO_BODY */ -#define __itt_state_get_ptr 0 -#endif /* INTEL_NO_MACRO_BODY */ -/** @endcond */ - -/** - * @deprecated Legacy API - * @brief managing thread and object states - */ -__itt_state_t LIBITTAPI __itt_state_set(__itt_state_t s); - -/** @cond exclude_from_documentation */ -#ifndef INTEL_NO_MACRO_BODY -#ifndef INTEL_NO_ITTNOTIFY_API -ITT_STUB(ITTAPI, __itt_state_t, state_set, (__itt_state_t s)) -#define __itt_state_set ITTNOTIFY_DATA(state_set) -#define __itt_state_set_ptr ITTNOTIFY_NAME(state_set) -#else /* INTEL_NO_ITTNOTIFY_API */ -#define __itt_state_set(s) (__itt_state_t)0 -#define __itt_state_set_ptr 0 -#endif /* INTEL_NO_ITTNOTIFY_API */ -#else /* INTEL_NO_MACRO_BODY */ -#define __itt_state_set_ptr 0 -#endif /* INTEL_NO_MACRO_BODY */ -/** @endcond */ - -/** - * @deprecated Legacy API - * @brief managing thread and object modes - */ -__itt_thr_state_t LIBITTAPI __itt_thr_mode_set(__itt_thr_prop_t p, __itt_thr_state_t s); - -/** @cond exclude_from_documentation */ -#ifndef INTEL_NO_MACRO_BODY -#ifndef INTEL_NO_ITTNOTIFY_API -ITT_STUB(ITTAPI, __itt_thr_state_t, thr_mode_set, (__itt_thr_prop_t p, __itt_thr_state_t s)) -#define __itt_thr_mode_set ITTNOTIFY_DATA(thr_mode_set) -#define __itt_thr_mode_set_ptr ITTNOTIFY_NAME(thr_mode_set) -#else /* INTEL_NO_ITTNOTIFY_API */ -#define __itt_thr_mode_set(p, s) (__itt_thr_state_t)0 -#define __itt_thr_mode_set_ptr 0 -#endif /* INTEL_NO_ITTNOTIFY_API */ -#else /* INTEL_NO_MACRO_BODY */ -#define __itt_thr_mode_set_ptr 0 -#endif /* INTEL_NO_MACRO_BODY */ -/** @endcond */ - -/** - * @deprecated Legacy API - * @brief managing thread and object modes - */ -__itt_obj_state_t LIBITTAPI __itt_obj_mode_set(__itt_obj_prop_t p, __itt_obj_state_t s); - -/** @cond exclude_from_documentation */ -#ifndef INTEL_NO_MACRO_BODY -#ifndef INTEL_NO_ITTNOTIFY_API -ITT_STUB(ITTAPI, __itt_obj_state_t, obj_mode_set, (__itt_obj_prop_t p, __itt_obj_state_t s)) -#define __itt_obj_mode_set ITTNOTIFY_DATA(obj_mode_set) -#define __itt_obj_mode_set_ptr ITTNOTIFY_NAME(obj_mode_set) -#else /* INTEL_NO_ITTNOTIFY_API */ -#define __itt_obj_mode_set(p, s) (__itt_obj_state_t)0 -#define __itt_obj_mode_set_ptr 0 -#endif /* INTEL_NO_ITTNOTIFY_API */ -#else /* INTEL_NO_MACRO_BODY */ -#define __itt_obj_mode_set_ptr 0 -#endif /* INTEL_NO_MACRO_BODY */ -/** @endcond */ -/** @} legacy_state group */ - -/** - * @defgroup frames Frames - * @ingroup legacy - * Frames group - * @{ - */ -/** - * @brief opaque structure for frame identification - */ -typedef struct __itt_frame_t *__itt_frame; - -/** - * @brief Create a global frame with given domain - */ -#if ITT_PLATFORM==ITT_PLATFORM_WIN -__itt_frame ITTAPI __itt_frame_createA(const char *domain); -__itt_frame ITTAPI __itt_frame_createW(const wchar_t *domain); -#if defined(UNICODE) || defined(_UNICODE) -# define __itt_frame_create 
__itt_frame_createW -# define __itt_frame_create_ptr __itt_frame_createW_ptr -#else /* UNICODE */ -# define __itt_frame_create __itt_frame_createA -# define __itt_frame_create_ptr __itt_frame_createA_ptr -#endif /* UNICODE */ -#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -__itt_frame ITTAPI __itt_frame_create(const char *domain); -#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ - -/** @cond exclude_from_documentation */ -#ifndef INTEL_NO_MACRO_BODY -#ifndef INTEL_NO_ITTNOTIFY_API -#if ITT_PLATFORM==ITT_PLATFORM_WIN -ITT_STUB(ITTAPI, __itt_frame, frame_createA, (const char *domain)) -ITT_STUB(ITTAPI, __itt_frame, frame_createW, (const wchar_t *domain)) -#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -ITT_STUB(ITTAPI, __itt_frame, frame_create, (const char *domain)) -#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#if ITT_PLATFORM==ITT_PLATFORM_WIN -#define __itt_frame_createA ITTNOTIFY_DATA(frame_createA) -#define __itt_frame_createA_ptr ITTNOTIFY_NAME(frame_createA) -#define __itt_frame_createW ITTNOTIFY_DATA(frame_createW) -#define __itt_frame_createW_ptr ITTNOTIFY_NAME(frame_createW) -#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#define __itt_frame_create ITTNOTIFY_DATA(frame_create) -#define __itt_frame_create_ptr ITTNOTIFY_NAME(frame_create) -#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#else /* INTEL_NO_ITTNOTIFY_API */ -#if ITT_PLATFORM==ITT_PLATFORM_WIN -#define __itt_frame_createA(domain) -#define __itt_frame_createA_ptr 0 -#define __itt_frame_createW(domain) -#define __itt_frame_createW_ptr 0 -#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#define __itt_frame_create(domain) -#define __itt_frame_create_ptr 0 -#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#endif /* INTEL_NO_ITTNOTIFY_API */ -#else /* INTEL_NO_MACRO_BODY */ -#if ITT_PLATFORM==ITT_PLATFORM_WIN -#define __itt_frame_createA_ptr 0 -#define __itt_frame_createW_ptr 0 -#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#define __itt_frame_create_ptr 0 -#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */ -#endif /* INTEL_NO_MACRO_BODY */ -/** @endcond */ - -/** @brief Record a frame begin occurrence. */ -void ITTAPI __itt_frame_begin(__itt_frame frame); -/** @brief Record a frame end occurrence. 
*/
-void ITTAPI __itt_frame_end (__itt_frame frame);
-
-/** @cond exclude_from_documentation */
-#ifndef INTEL_NO_MACRO_BODY
-#ifndef INTEL_NO_ITTNOTIFY_API
-ITT_STUBV(ITTAPI, void, frame_begin, (__itt_frame frame))
-ITT_STUBV(ITTAPI, void, frame_end, (__itt_frame frame))
-#define __itt_frame_begin ITTNOTIFY_VOID(frame_begin)
-#define __itt_frame_begin_ptr ITTNOTIFY_NAME(frame_begin)
-#define __itt_frame_end ITTNOTIFY_VOID(frame_end)
-#define __itt_frame_end_ptr ITTNOTIFY_NAME(frame_end)
-#else /* INTEL_NO_ITTNOTIFY_API */
-#define __itt_frame_begin(frame)
-#define __itt_frame_begin_ptr 0
-#define __itt_frame_end(frame)
-#define __itt_frame_end_ptr 0
-#endif /* INTEL_NO_ITTNOTIFY_API */
-#else /* INTEL_NO_MACRO_BODY */
-#define __itt_frame_begin_ptr 0
-#define __itt_frame_end_ptr 0
-#endif /* INTEL_NO_MACRO_BODY */
-/** @endcond */
-/** @} frames group */
-
-#ifdef __cplusplus
-}
-#endif /* __cplusplus */
-
-#endif /* _LEGACY_ITTNOTIFY_H_ */
diff --git a/src/common/layer_normalization.cpp b/src/common/layer_normalization.cpp
index 79ccc98c45c..62804c601dc 100644
--- a/src/common/layer_normalization.cpp
+++ b/src/common/layer_normalization.cpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2019-2024 Intel Corporation
+* Copyright 2019-2025 Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -157,19 +157,25 @@ status_t layer_normalization_attr_check(const layer_normalization_desc_t &desc,
 const bool is_int8 = utils::one_of(src_dt, data_type::s8, data_type::u8)
 || utils::one_of(dst_dt, data_type::s8, data_type::u8);
- if (is_int8) fwd_attr_mask |= smask_t::scales_runtime;
+ if (is_int8) fwd_attr_mask |= smask_t::scales;
 VCHECK_LNORM_UNIMPL(attr->has_default_values(fwd_attr_mask, dst_dt),
 VERBOSE_UNSUPPORTED_ATTR);
 // Check scales
 if (!attr->scales_.has_default_values()) {
- const auto &sc = attr->scales_;
- const int mask_src = sc.get(DNNL_ARG_SRC).mask_;
- const int mask_dst = sc.get(DNNL_ARG_DST).mask_;
-
- VCHECK_LNORM_UNIMPL(utils::everyone_is(0, mask_src, mask_dst),
+ static const std::vector<int> supported_args {
+ DNNL_ARG_SRC, DNNL_ARG_DST};
+ VCHECK_LNORM_UNIMPL(
+ attr->scales_.has_default_values(supported_args),
 VERBOSE_UNSUPPORTED_SCALES_CFG);
+
+ for (int arg : supported_args) {
+ if (attr->scales_.has_default_values(arg)) continue;
+
+ const int mask = attr->scales_.get_mask(arg);
+ VCHECK_LNORM_UNIMPL(mask == 0, VERBOSE_UNSUPPORTED_SCALES_CFG);
+ }
 }
 // Check post-ops
@@ -178,6 +184,9 @@ status_t layer_normalization_attr_check(const layer_normalization_desc_t &desc,
 using namespace primitive_kind;
 VCHECK_LNORM_UNIMPL(po.has_default_values({binary, eltwise, sum}),
 VERBOSE_UNSUPPORTED_POSTOP);
+
+ // Note: verbose support is inside the call.
+ CHECK(po.validate_binary_with_dst_consistency(&desc.dst_desc));
 }
 } else {
 VCHECK_LNORM_UNIMPL(false, VERBOSE_UNSUPPORTED_ATTR);
diff --git a/src/common/layer_normalization_pd.hpp b/src/common/layer_normalization_pd.hpp
index 242ea372eee..bdd27c47978 100644
--- a/src/common/layer_normalization_pd.hpp
+++ b/src/common/layer_normalization_pd.hpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2019-2024 Intel Corporation
+* Copyright 2019-2025 Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
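For context, the scales check above only admits per-tensor quantization on SRC and DST: a mask of 0 means a single scale value covers the whole tensor. A minimal sketch of a setup that passes this check, assuming the oneDNN v3.x C++ API (dnnl.hpp); the helper name and the epsilon/flags choices are illustrative, not part of this patch:

    // Sketch: int8 layer normalization with per-tensor (mask = 0) scales,
    // the only scales configuration layer_normalization_attr_check accepts.
    dnnl::layer_normalization_forward::primitive_desc make_int8_lnorm_pd(
            const dnnl::engine &eng, const dnnl::memory::desc &src_md,
            const dnnl::memory::desc &dst_md) {
        dnnl::primitive_attr attr;
        attr.set_scales_mask(DNNL_ARG_SRC, 0); // mask 0: one scale per tensor
        attr.set_scales_mask(DNNL_ARG_DST, 0);
        return dnnl::layer_normalization_forward::primitive_desc(eng,
                dnnl::prop_kind::forward_inference, src_md, dst_md,
                /*epsilon=*/1e-5f, dnnl::normalization_flags::none, attr);
    }

The scale values themselves are bound at execution time as one-element memories under DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC and DNNL_ARG_ATTR_SCALES | DNNL_ARG_DST.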
@@ -107,11 +107,11 @@ struct layer_normalization_pd_t : public primitive_desc_t {
 memory_desc_t stat_md_;
 memory_desc_t scaleshift_md_;
- layer_normalization_pd_t(const layer_normalization_desc_t *adesc,
+ layer_normalization_pd_t(const op_desc_t *adesc,
 const primitive_attr_t *attr,
 const layer_normalization_fwd_pd_t *hint_fwd_pd)
 : primitive_desc_t(attr, base_pkind)
- , desc_(*adesc)
+ , desc_(*op_desc_t::to_desc(adesc))
 , hint_fwd_pd_(hint_fwd_pd)
 , src_md_(desc_.src_desc)
 , stat_md_(desc_.stat_desc)
@@ -156,9 +156,10 @@ struct layer_normalization_pd_t : public primitive_desc_t {
 const memory_desc_t &src_desc() const { return desc_.src_desc; }
 };
+// NOLINTBEGIN(google-default-arguments)
 struct layer_normalization_fwd_pd_t : public layer_normalization_pd_t {
- typedef layer_normalization_fwd_pd_t base_class;
- typedef layer_normalization_fwd_pd_t hint_class;
+ using base_class = layer_normalization_fwd_pd_t;
+ using hint_class = layer_normalization_fwd_pd_t;
 arg_usage_t arg_usage(int arg) const override {
 if (arg == DNNL_ARG_SRC) return arg_usage_t::input;
@@ -170,8 +171,10 @@ struct layer_normalization_fwd_pd_t : public layer_normalization_pd_t {
 return arg_usage_t::unused;
 }
- if (arg == DNNL_ARG_SCALE && use_scale()) return arg_usage_t::input;
- if (arg == DNNL_ARG_SHIFT && use_shift()) return arg_usage_t::input;
+ if (arg == DNNL_ARG_SCALE)
+ return use_scale() ? arg_usage_t::input : arg_usage_t::unused;
+ if (arg == DNNL_ARG_SHIFT)
+ return use_shift() ? arg_usage_t::input : arg_usage_t::unused;
 return primitive_desc_t::arg_usage(arg);
 }
@@ -224,7 +227,7 @@ struct layer_normalization_fwd_pd_t : public layer_normalization_pd_t {
 protected:
 memory_desc_t dst_md_;
- layer_normalization_fwd_pd_t(const layer_normalization_desc_t *adesc,
+ layer_normalization_fwd_pd_t(const op_desc_t *adesc,
 const primitive_attr_t *attr,
 const layer_normalization_fwd_pd_t *hint_fwd_pd)
 : layer_normalization_pd_t(adesc, attr, hint_fwd_pd)
@@ -248,34 +251,44 @@ struct layer_normalization_fwd_pd_t : public layer_normalization_pd_t {
 return false;
 }
- bool attr_scales_ok() const {
+ bool attr_scales_ok(const std::vector<int> &supported_args
+ = {DNNL_ARG_SRC, DNNL_ARG_DST}) const {
+ using namespace data_type;
 const auto &scales = attr()->scales_;
- bool ok = true;
- for (const auto &e : scales.scales_) {
- ok = ok && e.second.mask_ == 0;
+ bool ok = scales.has_default_values(supported_args);
+
+ for (const auto &arg : supported_args) {
+ if (!scales.has_default_values(arg)) {
+ // TODO: disallow non-int8 scales?
+ // const data_type_t dt = arg_md(arg)->data_type;
+ // ok = ok && utils::one_of(dt, s8, u8);
+ ok = ok && scales.get_mask(arg) == 0;
+ }
 }
 return ok;
 }
 };
+// NOLINTEND(google-default-arguments)
+// NOLINTBEGIN(google-default-arguments)
 struct layer_normalization_bwd_pd_t : public layer_normalization_pd_t {
- typedef layer_normalization_bwd_pd_t base_class;
- typedef layer_normalization_fwd_pd_t hint_class;
+ using base_class = layer_normalization_bwd_pd_t;
+ using hint_class = layer_normalization_fwd_pd_t;
 arg_usage_t arg_usage(int arg) const override {
 if (utils::one_of(arg, DNNL_ARG_SRC, DNNL_ARG_MEAN, DNNL_ARG_VARIANCE,
 DNNL_ARG_DIFF_DST))
 return arg_usage_t::input;
- if (arg == DNNL_ARG_SCALE && use_scale()) return arg_usage_t::input;
- if (arg == DNNL_ARG_SHIFT && use_shift()) return arg_usage_t::input;
+ if (arg == DNNL_ARG_SCALE)
+ return use_scale() ?
arg_usage_t::input : arg_usage_t::unused; if (arg == DNNL_ARG_DIFF_SRC) return arg_usage_t::output; - if (arg == DNNL_ARG_DIFF_SCALE && use_scale()) - return arg_usage_t::output; - if (arg == DNNL_ARG_DIFF_SHIFT && use_shift()) - return arg_usage_t::output; + if (arg == DNNL_ARG_DIFF_SCALE) + return use_scale() ? arg_usage_t::output : arg_usage_t::unused; + if (arg == DNNL_ARG_DIFF_SHIFT) + return use_shift() ? arg_usage_t::output : arg_usage_t::unused; return primitive_desc_t::arg_usage(arg); } @@ -324,7 +337,7 @@ struct layer_normalization_bwd_pd_t : public layer_normalization_pd_t { return index == 0 ? &diff_scaleshift_md_ : &glob_zero_md; } - int n_inputs() const override { return 4 + use_scale() + use_shift(); } + int n_inputs() const override { return 4 + use_scale(); } int n_outputs() const override { return 1 + (desc_.prop_kind == prop_kind::backward) @@ -336,7 +349,7 @@ struct layer_normalization_bwd_pd_t : public layer_normalization_pd_t { memory_desc_t diff_dst_md_; memory_desc_t diff_scaleshift_md_; - layer_normalization_bwd_pd_t(const layer_normalization_desc_t *adesc, + layer_normalization_bwd_pd_t(const op_desc_t *adesc, const primitive_attr_t *attr, const layer_normalization_fwd_pd_t *hint_fwd_pd) : layer_normalization_pd_t(adesc, attr, hint_fwd_pd) @@ -368,6 +381,7 @@ struct layer_normalization_bwd_pd_t : public layer_normalization_pd_t { return false; } }; +// NOLINTEND(google-default-arguments) } // namespace impl } // namespace dnnl diff --git a/src/common/logging.cpp b/src/common/logging.cpp index 0eaec490899..bb4f1f9b2a8 100644 --- a/src/common/logging.cpp +++ b/src/common/logging.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2024 Intel Corporation +* Copyright 2024-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
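The attr_scales_ok() rework above replaces the old hard-coded loop over scales_.scales_ with per-argument queries and lets an implementation narrow the argument list it supports. An illustrative use in a backend pd (not taken from this patch; the init() body is a sketch assuming the usual internal status codes):

    // Illustrative backend code: a pd that implements only source scales
    // passes its own list instead of the {DNNL_ARG_SRC, DNNL_ARG_DST} default.
    status_t init(engine_t *engine) {
        if (!attr_scales_ok({DNNL_ARG_SRC}))
            return status::unimplemented;
        // ... remaining implementation-specific checks ...
        return status::success;
    }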
@@ -17,21 +17,20 @@ #include "common/logging.hpp" #include "common/utils.hpp" -#include "common/spdlog/sinks/rotating_file_sink.h" -#include "common/spdlog/spdlog.h" +#include "spdlog/sinks/rotating_file_sink.h" +#include "spdlog/spdlog.h" namespace dnnl { namespace impl { -log_manager_t::log_manager_t() { - +log_manager_t::log_manager_t() + : logfile_path_(getenv_string_user("VERBOSE_LOGFILE")) // enables logging as well as printing to stdout - console_flag_ = getenv_int_user("VERBOSE_LOG_WITH_CONSOLE", 0); + , console_flag_(getenv_int_user("VERBOSE_LOG_WITH_CONSOLE", 0)) { // logging is automatically disabled when no filepath is provided by // DNNL_VERBOSE_LOGFILE // in this case, we fall back to printing to stdout - logfile_path_ = getenv_string_user("VERBOSE_LOGFILE"); if (logfile_path_.empty()) { console_flag_ = true; return; @@ -93,7 +92,7 @@ void log_manager_t::log(const char *msg, log_level_t log_level) const { void log_manager_t::set_log_level(const std::string &vmode_str) const { // The logging level is determined from the verbose mode // with the following order of decreasing priority: - // [trace, debug, info, error, critical, off] + // [trace, debug, info, warn, error, critical, off] spdlog::set_level(spdlog::level::off); if (vmode_str == "-1" || vmode_str == "all") { @@ -104,6 +103,8 @@ void log_manager_t::set_log_level(const std::string &vmode_str) const { || vmode_str.find("profile") != std::string::npos || vmode_str.find("dispatch") != std::string::npos) { spdlog::set_level(spdlog::level::info); + } else if (vmode_str.find("warn") != std::string::npos) { + spdlog::set_level(spdlog::level::warn); } else if (vmode_str.find("check") != std::string::npos) { spdlog::set_level(spdlog::level::err); } else if (vmode_str.find("error") != std::string::npos) { diff --git a/src/common/lrn_pd.hpp b/src/common/lrn_pd.hpp index e7afb7b2bdb..0205c21b213 100644 --- a/src/common/lrn_pd.hpp +++ b/src/common/lrn_pd.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2016-2024 Intel Corporation +* Copyright 2016-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -89,26 +89,27 @@ struct lrn_pd_t : public primitive_desc_t { memory_desc_t src_md_; memory_desc_t ws_md_; - lrn_pd_t(const lrn_desc_t *adesc, const primitive_attr_t *attr, + lrn_pd_t(const op_desc_t *adesc, const primitive_attr_t *attr, const lrn_fwd_pd_t *hint_fwd_pd) : primitive_desc_t(attr, base_pkind) - , desc_(*adesc) + , desc_(*op_desc_t::to_desc(adesc)) , hint_fwd_pd_(hint_fwd_pd) - , src_md_(desc_.src_desc) - , ws_md_() {} + , src_md_(desc_.src_desc) {} }; +// NOLINTBEGIN(google-default-arguments) struct lrn_fwd_pd_t : public lrn_pd_t { - typedef lrn_fwd_pd_t base_class; - typedef lrn_fwd_pd_t hint_class; + using base_class = lrn_fwd_pd_t; + using hint_class = lrn_fwd_pd_t; arg_usage_t arg_usage(int arg) const override { if (arg == DNNL_ARG_SRC) return arg_usage_t::input; if (arg == DNNL_ARG_DST) return arg_usage_t::output; - if (arg == DNNL_ARG_WORKSPACE && (!types::is_zero_md(workspace_md()))) - return arg_usage_t::output; + if (arg == DNNL_ARG_WORKSPACE) + return !types::is_zero_md(workspace_md()) ? 
arg_usage_t::output
+ : arg_usage_t::unused;
 return primitive_desc_t::arg_usage(arg);
 }
@@ -145,7 +146,7 @@ struct lrn_fwd_pd_t : public lrn_pd_t {
 protected:
 memory_desc_t dst_md_;
- lrn_fwd_pd_t(const lrn_desc_t *adesc, const primitive_attr_t *attr,
+ lrn_fwd_pd_t(const op_desc_t *adesc, const primitive_attr_t *attr,
 const lrn_fwd_pd_t *hint_fwd_pd)
 : lrn_pd_t(adesc, attr, hint_fwd_pd), dst_md_(desc_.dst_desc) {}
@@ -156,10 +157,12 @@ struct lrn_fwd_pd_t : public lrn_pd_t {
 == status::success);
 }
 };
+// NOLINTEND(google-default-arguments)
+
+// NOLINTBEGIN(google-default-arguments)
 struct lrn_bwd_pd_t : public lrn_pd_t {
- typedef lrn_bwd_pd_t base_class;
- typedef lrn_fwd_pd_t hint_class;
+ using base_class = lrn_bwd_pd_t;
+ using hint_class = lrn_fwd_pd_t;
 arg_usage_t arg_usage(int arg) const override {
 if (utils::one_of(arg, DNNL_ARG_SRC, DNNL_ARG_DIFF_DST))
@@ -167,8 +170,9 @@ struct lrn_bwd_pd_t : public lrn_pd_t {
 if (arg == DNNL_ARG_DIFF_SRC) return arg_usage_t::output;
- if (arg == DNNL_ARG_WORKSPACE && (!types::is_zero_md(workspace_md())))
- return arg_usage_t::input;
+ if (arg == DNNL_ARG_WORKSPACE)
+ return !types::is_zero_md(workspace_md()) ? arg_usage_t::input
+ : arg_usage_t::unused;
 return primitive_desc_t::arg_usage(arg);
 }
@@ -214,7 +218,7 @@ struct lrn_bwd_pd_t : public lrn_pd_t {
 memory_desc_t diff_src_md_;
 memory_desc_t diff_dst_md_;
- lrn_bwd_pd_t(const lrn_desc_t *adesc, const primitive_attr_t *attr,
+ lrn_bwd_pd_t(const op_desc_t *adesc, const primitive_attr_t *attr,
 const lrn_fwd_pd_t *hint_fwd_pd)
 : lrn_pd_t(adesc, attr, hint_fwd_pd)
 , diff_src_md_(desc_.diff_src_desc)
@@ -231,6 +235,7 @@ struct lrn_bwd_pd_t : public lrn_pd_t {
 == status::success);
 }
 };
+// NOLINTEND(google-default-arguments)
 } // namespace impl
 } // namespace dnnl
diff --git a/src/common/math_utils.hpp b/src/common/math_utils.hpp
index 0c156dff8db..848e393b6e9 100644
--- a/src/common/math_utils.hpp
+++ b/src/common/math_utils.hpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2017-2024 Intel Corporation
+* Copyright 2017-2025 Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -68,7 +68,8 @@ inline T gcd(T a, T b) {
 return b;
 }
-inline int lcm(int a, int b) {
+template <typename T>
+inline T lcm(T a, T b) {
 a = impl::nstl::abs(a);
 b = impl::nstl::abs(b);
 assert(a > 0 && b > 0);
@@ -88,12 +89,15 @@ inline int ilog2q(size_t v) {
 int p = 0;
 #define CP(pw) \
 do { \
- if (v >= (1ull << pw)) { \
- v >>= pw; \
- p += pw; \
+ if (v >= (1ull << (pw))) { \
+ v >>= (pw); \
+ p += (pw); \
 } \
 } while (0)
+
+#if INTPTR_MAX == INT64_MAX
 CP(32);
+#endif
 CP(16);
 CP(8);
 CP(4);
@@ -238,7 +242,7 @@ template <typename T, typename U = typename utils::remove_reference<T>::type>
 inline U logistic_fwd(T s) {
 // Here we avoid division/inverse by infinity as some architectures have
 // non-standard behavior
- float exp_overflow_bound = 88.72283172607421875;
+ float exp_overflow_bound = 88.72283172607421875f;
 float in = (float)-s;
 return in < exp_overflow_bound ? (U)(1.f / (1.f + ::expf(in))) : 0.f;
 }
@@ -255,7 +259,7 @@ inline U logistic_bwd_use_dst(T dd, T d) {
 template <typename T, typename A,
 typename U = typename utils::remove_reference<T>::type>
 inline U soft_relu_fwd(T s, A alpha) {
- float exp_overflow_bound = 88.72283172607421875;
+ float exp_overflow_bound = 20.f;
 float in = (float)s * (float)alpha;
 float v = (in < exp_overflow_bound ? (U)(::log1pf(::expf(in))) : (U)in);
 return (U)(v / alpha);
 }
@@ -414,6 +418,31 @@ inline U hardswish_bwd(T dd, T s, A alpha, A beta) {
 return v <= 0.f ?
0.f : v >= 1.f ? dd : dd * w; } +template ::type> +inline U hsigmoid_fwd(T s) { + float v = s + 3.0f; + v = v > 0.0f ? v : 0.0f; + v = v < 6.0f ? v : 6.0f; + return (U)(v / 6.0f); +} + +template ::type> +inline U round_half_to_even_fwd(T s) { + float r = ::roundf((float)s); + float d = (float)s - r; + float remainder = ::fmodf(r, 2.0f); + return ((d != 0.5f) && (d != -0.5f)) || (remainder == 0.0f) ? (U)r : + (U)((float)s + d); +} + +template ::type> +inline U round_half_away_from_zero_fwd(T s) { + return (U)(::roundf((float)s)); +} + inline bool is_eltwise_ok( data_type_t src_dt, alg_kind_t alg, float alpha, float beta) { using namespace alg_kind; @@ -426,7 +455,8 @@ inline bool is_eltwise_ok( eltwise_exp, eltwise_gelu_tanh, eltwise_hardsigmoid, eltwise_hardswish, eltwise_swish, eltwise_log, eltwise_clip, eltwise_clip_v2, eltwise_pow, - eltwise_gelu_erf, eltwise_round) + eltwise_gelu_erf, eltwise_round, + eltwise_hsigmoid, eltwise_round_half_away_from_zero, eltwise_round_half_to_even) && IMPLICATION( one_of(alg, eltwise_clip, eltwise_clip_v2), beta >= alpha) && IMPLICATION(alg == eltwise_round, src_dt == dnnl_f32) @@ -514,7 +544,7 @@ inline uint16_t philox8x16(uint32_t idx, uint32_t seed) { // - 1 lsb is used to index 16-bit words within this 32 bit random // value uint32_t r = philox4x32(idx >> 1, seed); - return (uint16_t)(r >> ((idx & 1) * sizeof(uint16_t))); + return (uint16_t)(r >> ((idx & 1) * sizeof(uint16_t) * 8)); } inline uint8_t philox16x8(uint32_t idx, uint32_t seed) { @@ -523,7 +553,7 @@ inline uint8_t philox16x8(uint32_t idx, uint32_t seed) { // - 2 lsb is used to index 8-bit words within this 32 bit random // value uint32_t r = philox4x32(idx >> 2, seed); - return (uint8_t)(r >> ((idx & 3) * sizeof(uint8_t))); + return (uint8_t)(r >> ((idx & 3) * sizeof(uint8_t) * 8)); } inline float stochastic_round_fwd( @@ -551,8 +581,8 @@ inline float stochastic_round_fwd( << (digits(data_type::f32) - digits(dst_dt)); // IMPORTANT: lsb of bias are used. - uint32_t rnd_bias = data_type_size(dst_dt) == 16 ? philox16x8(idx, seed) - : philox8x16(idx, seed); + uint32_t rnd_bias = data_type_size(dst_dt) == 2 ? philox16x8(idx, seed) + : philox8x16(idx, seed); rnd_bias = rnd_bias & ~truncation_mask; uint32_t s_u = utils::bit_cast(s); @@ -567,6 +597,42 @@ inline float stochastic_round_fwd( return r; } +inline float get_bias(const char *bias, size_t offset, data_type_t data_type) { + if (!bias) return 0.0f; + +#define CASE(dt) \ + case dt: return (float)((const prec_traits_t
+inline float get_bias(const char *bias, size_t offset, data_type_t data_type) { + if (!bias) return 0.0f; + +#define CASE(dt) \ + case dt: return (float)((const prec_traits_t<dt>::type *)bias)[offset] + + switch (data_type) { + CASE(data_type::s8); + CASE(data_type::u8); + CASE(data_type::bf16); + CASE(data_type::s32); + CASE(data_type::f32); + default: assert(!"unimplemented"); + } + return 0; // never happens (should probably be a NaN) +#undef CASE +} + +inline float get_sum(char *sum, size_t offset, data_type_t data_type) { + if (!sum) + return 0.0f; + +#define CASE(dt) \ + case dt: return (float)((const prec_traits_t<dt>::type *)sum)[offset] + + switch (data_type) { + CASE(data_type::s8); + CASE(data_type::u8); + CASE(data_type::s32); + CASE(data_type::f32); + default: assert(!"unimplemented"); + } + return 0; // never happens (should probably be a NaN) +#undef CASE +} + } // namespace math } // namespace impl } // namespace dnnl
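The get_bias/get_sum helpers above dispatch on a runtime data_type_t and widen each element to float through prec_traits_t. A hypothetical caller (sketch; the function name and buffer are illustrative, not from the patch):

    // Assumes math_utils.hpp is included; `buf` holds `n` elements whose
    // element type is known only at runtime as `dt` (s8/u8/s32/f32).
    inline float sum_all(char *buf, size_t n, dnnl::impl::data_type_t dt) {
        float acc = 0.f;
        for (size_t i = 0; i < n; ++i)
            acc += dnnl::impl::math::get_sum(buf, i, dt);
        return acc;
    }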
diff --git a/src/common/matmul.cpp b/src/common/matmul.cpp index c0fb43b6b56..12174cc3b24 100644 --- a/src/common/matmul.cpp +++ b/src/common/matmul.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2024 Intel Corporation +* Copyright 2019-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -52,25 +52,28 @@ status_t matmul_attr_check(const matmul_desc_t &desc, const engine_t *engine, auto attr_mask = smask_t::post_ops | smask_t::sum_dt | smask_t::dropout | smask_t::rounding_mode; // Matmul supports scales for floating point data types - attr_mask |= smask_t::scales_runtime; - attr_mask |= smask_t::scales_runtime_data_type; + attr_mask |= smask_t::scales_data_type; const bool src_is_int8 = utils::one_of(src_dt, data_type::s8, data_type::u8); - if (src_is_int8) attr_mask |= smask_t::zero_points_runtime; + const bool src_is_fp8 + = utils::one_of(src_dt, data_type::f8_e5m2, data_type::f8_e4m3); + if (src_is_int8 || src_is_fp8) attr_mask |= smask_t::zero_points; // Matmul supports zero points for floating point data types as part of // weights decompression. const bool wei_is_int = utils::one_of( wei_dt, data_type::s8, data_type::u8, data_type::s4, data_type::u4); - if (wei_is_int) { - attr_mask |= smask_t::zero_points_runtime_data_type; - attr_mask |= smask_t::zero_points_runtime_groups; - attr_mask |= smask_t::scales_runtime_groups; + const bool wei_is_fp8 + = utils::one_of(wei_dt, data_type::f8_e5m2, data_type::f8_e4m3); + if (wei_is_int || wei_is_fp8) { + attr_mask |= smask_t::zero_points_data_type; + attr_mask |= smask_t::zero_points_groups; + attr_mask |= smask_t::scales_groups; } - // Matmul supports fpmath mode - attr_mask |= smask_t::fpmath_mode; + // Matmul supports fpmath mode and accumulation mode + attr_mask |= smask_t::fpmath_mode | smask_t::accumulation_mode; VCHECK_MATMUL_UNIMPL(attr->has_default_values(attr_mask, dst_dt), VERBOSE_UNSUPPORTED_ATTR); @@ -85,67 +88,173 @@ status_t matmul_attr_check(const matmul_desc_t &desc, const engine_t *engine, int wei_qmask_K = 1 << (ndims_wei - 2); int wei_qmask_N = 1 << (ndims_wei - 1); + int dst_qmask_M = src_qmask_M; + int dst_qmask_N = wei_qmask_N; + // Check scales if (!attr->scales_.has_default_values()) { const auto &sc = attr->scales_; - const int mask_src = sc.get(DNNL_ARG_SRC).mask_; - const int mask_wei = sc.get(DNNL_ARG_WEIGHTS).mask_; - const int mask_dst = sc.get(DNNL_ARG_DST).mask_; - - // Check allowed masks. 
- VCHECK_MATMUL_UNIMPL(utils::one_of(mask_src, 0, src_qmask_K, - src_qmask_M + src_qmask_K) - && utils::one_of(mask_wei, 0, wei_qmask_N, - wei_qmask_N + wei_qmask_K) - && mask_dst == 0, - VERBOSE_UNSUPPORTED_SCALES_CFG); + + dim_t src_scale_group_k = 1; + if (!sc.has_default_values(DNNL_ARG_SRC)) { + const int mask_src = sc.get_mask(DNNL_ARG_SRC); + + VCHECK_MATMUL_UNIMPL(utils::one_of(mask_src, 0, src_qmask_K, + src_qmask_M + src_qmask_K), + VERBOSE_UNSUPPORTED_SCALES_CFG); + + if (!sc.get(DNNL_ARG_SRC).has_default_groups()) { + if (mask_src & src_qmask_K) + src_scale_group_k = sc.get_group(DNNL_ARG_SRC, 1); + } + + // Due to hardware specifics, groups should be multiple of 32. + VCHECK_MATMUL_UNIMPL(IMPLICATION(src_scale_group_k > 1, + src_scale_group_k % 32 == 0), + VERBOSE_UNSUPPORTED_SCALES_CFG); + } + + dim_t wei_scale_group_k = 1; + dim_t wei_scale_group_n = 1; + if (!sc.has_default_values(DNNL_ARG_WEIGHTS)) { + const int mask_wei = sc.get_mask(DNNL_ARG_WEIGHTS); + + // Masks for weights scales can be any - skipping them. + + if (!sc.get(DNNL_ARG_WEIGHTS).has_default_groups()) { + if (mask_wei & wei_qmask_K) + wei_scale_group_k = sc.get_group(DNNL_ARG_WEIGHTS, 0); + if (mask_wei & wei_qmask_N) + wei_scale_group_n = sc.get_group(DNNL_ARG_WEIGHTS, 1); + } + + // Groups per N are solely for weights decompression as it's + // impossible to get performant kernel for a single `k` element in + // chain for regular quantized case. + VCHECK_MATMUL_UNIMPL(IMPLICATION(wei_scale_group_n > 1, + attr->fpmath_.apply_to_int_), + VERBOSE_UNSUPPORTED_SCALES_CFG); + + // Due to hardware specifics, groups should be multiple of 32. + VCHECK_MATMUL_UNIMPL(IMPLICATION(wei_scale_group_k > 1, + wei_scale_group_k % 32 == 0), + VERBOSE_UNSUPPORTED_SCALES_CFG); + VCHECK_MATMUL_UNIMPL(IMPLICATION(wei_scale_group_n > 1, + wei_scale_group_n % 32 == 0), + VERBOSE_UNSUPPORTED_SCALES_CFG); + } + + if (!sc.has_default_values(DNNL_ARG_DST)) { + const int mask_dst = sc.get_mask(DNNL_ARG_DST); + + if (engine->kind() == engine_kind::gpu) { + VCHECK_MATMUL_UNIMPL( + utils::one_of(mask_dst, 0, dst_qmask_N, dst_qmask_M, + dst_qmask_N + dst_qmask_M), + VERBOSE_UNSUPPORTED_SCALES_CFG); + } else { + VCHECK_MATMUL_UNIMPL( + mask_dst == 0, VERBOSE_UNSUPPORTED_SCALES_CFG); + } + } + // Check dependency between scales. // Source scales groups are supported for int8 source and must divide // or be divided by weights groups when both are greater than 1. - const auto src_scale_group_k = (mask_src & src_qmask_K) - ? sc.get(DNNL_ARG_SRC).group_dims_[1] - : 1; - const auto wei_scale_group_k = (mask_wei & wei_qmask_K) - ? 
sc.get(DNNL_ARG_WEIGHTS).group_dims_[0] - : 1; const bool groups_are_divisible = IMPLICATION( src_scale_group_k > 1 && wei_scale_group_k > 1, (src_scale_group_k % wei_scale_group_k == 0) || (wei_scale_group_k % src_scale_group_k == 0)); - VCHECK_MATMUL_UNIMPL(IMPLICATION(src_scale_group_k > 1, - src_is_int8 && groups_are_divisible), + VCHECK_MATMUL_UNIMPL( + IMPLICATION(src_scale_group_k > 1, + (src_is_int8 || src_is_fp8) && groups_are_divisible), VERBOSE_UNSUPPORTED_SCALES_CFG); } // Check zero points if (!attr->zero_points_.has_default_values()) { const auto &zp = attr->zero_points_; - int mask_src = 0, mask_wei = 0, mask_dst = 0; - zp.get(DNNL_ARG_SRC, &mask_src); - zp.get(DNNL_ARG_WEIGHTS, &mask_wei); - zp.get(DNNL_ARG_DST, &mask_dst); - VCHECK_MATMUL_UNIMPL(mask_src == 0 - || (desc.src_desc.ndims == 2 && mask_src == 1 << 1), - VERBOSE_UNSUPPORTED_ZP_CFG); - VCHECK_MATMUL_UNIMPL(utils::one_of(mask_wei, 0, wei_qmask_N, - wei_qmask_N + wei_qmask_K), - VERBOSE_UNSUPPORTED_ZP_CFG); - VCHECK_MATMUL_UNIMPL(mask_dst == 0 - || (desc.dst_desc.ndims == 2 && mask_dst == 1 << 1), - VERBOSE_UNSUPPORTED_ZP_CFG); + dim_t src_zero_point_group_k = 1; + if (!zp.has_default_values(DNNL_ARG_SRC)) { + const int mask_src = zp.get_mask(DNNL_ARG_SRC); - if (utils::one_of(zp.get_data_type(DNNL_ARG_WEIGHTS), data_type::s4, - data_type::u4)) { - dim_t k = desc.weights_desc.dims[ndims_wei - 2]; - dim_t n = desc.weights_desc.dims[ndims_wei - 1]; - VCHECK_MATMUL_UNIMPL( - IMPLICATION(mask_wei & wei_qmask_K, k % 2 == 0), + VCHECK_MATMUL_UNIMPL(utils::one_of(mask_src, 0, src_qmask_K, + src_qmask_M + src_qmask_K), VERBOSE_UNSUPPORTED_ZP_CFG); - VCHECK_MATMUL_UNIMPL( - IMPLICATION(mask_wei & wei_qmask_N, n % 2 == 0), + + if (!zp.get(DNNL_ARG_SRC).has_default_groups()) { + if (mask_src & src_qmask_K) + src_zero_point_group_k = zp.get_group(DNNL_ARG_SRC, 1); + } + + // Due to hardware specifics, groups should be multiple of 32. + VCHECK_MATMUL_UNIMPL(IMPLICATION(src_zero_point_group_k > 1, + src_zero_point_group_k % 32 == 0), VERBOSE_UNSUPPORTED_ZP_CFG); } + + dim_t wei_zero_point_group_k = 1; + dim_t wei_zero_point_group_n = 1; + if (!zp.has_default_values(DNNL_ARG_WEIGHTS)) { + const int mask_wei = zp.get_mask(DNNL_ARG_WEIGHTS); + + // Masks for weights zero_points can be any - skipping them. + + if (!zp.get(DNNL_ARG_WEIGHTS).has_default_groups()) { + if (mask_wei & wei_qmask_K) + wei_zero_point_group_k = zp.get_group(DNNL_ARG_WEIGHTS, 0); + if (mask_wei & wei_qmask_N) + wei_zero_point_group_n = zp.get_group(DNNL_ARG_WEIGHTS, 1); + } + + // Groups per N are solely for weights decompression as it's + // impossible to get performant kernel for a single `k` element in + // chain for regular quantized case. + VCHECK_MATMUL_UNIMPL(IMPLICATION(wei_zero_point_group_n > 1, + attr->fpmath_.apply_to_int_), + VERBOSE_UNSUPPORTED_ZP_CFG); + + // Due to hardware specifics, groups should be multiple of 32. 
+ VCHECK_MATMUL_UNIMPL(IMPLICATION(wei_zero_point_group_k > 1, + wei_zero_point_group_k % 32 == 0), + VERBOSE_UNSUPPORTED_ZP_CFG); + VCHECK_MATMUL_UNIMPL(IMPLICATION(wei_zero_point_group_n > 1, + wei_zero_point_group_n % 32 == 0), + VERBOSE_UNSUPPORTED_ZP_CFG); + + if (utils::one_of(zp.get_data_type(DNNL_ARG_WEIGHTS), data_type::s4, + data_type::u4)) { + dim_t k = desc.weights_desc.dims[ndims_wei - 2]; + dim_t n = desc.weights_desc.dims[ndims_wei - 1]; + VCHECK_MATMUL_UNIMPL( + IMPLICATION(mask_wei & wei_qmask_K, k % 2 == 0), + VERBOSE_UNSUPPORTED_ZP_CFG); + VCHECK_MATMUL_UNIMPL( + IMPLICATION(mask_wei & wei_qmask_N, n % 2 == 0), + VERBOSE_UNSUPPORTED_ZP_CFG); + } + } + + if (!zp.has_default_values(DNNL_ARG_DST)) { + const int mask_dst = zp.get_mask(DNNL_ARG_DST); + + VCHECK_MATMUL_UNIMPL(mask_dst == 0 + || (desc.dst_desc.ndims == 2 && mask_dst == 1 << 1), + VERBOSE_UNSUPPORTED_ZP_CFG); + } + + // Check dependency between zero_points. + // Source zero_points groups are supported for int8 source and must + // divide or be divided by weights groups when both are greater than 1. + const bool groups_are_divisible = IMPLICATION( + src_zero_point_group_k > 1 && wei_zero_point_group_k > 1, + (src_zero_point_group_k % wei_zero_point_group_k == 0) + || (wei_zero_point_group_k % src_zero_point_group_k + == 0)); + VCHECK_MATMUL_UNIMPL(IMPLICATION(src_zero_point_group_k > 1, + src_is_int8 && groups_are_divisible), + VERBOSE_UNSUPPORTED_ZP_CFG); } // Check post-ops @@ -160,6 +269,9 @@ status_t matmul_attr_check(const matmul_desc_t &desc, const engine_t *engine, VCHECK_MATMUL_UNIMPL( po.check_sum_consistency(dst_dt, src_is_int8, true), VERBOSE_UNSUPPORTED_POSTOP); + + // Note: verbose support is inside the call. + CHECK(po.validate_binary_with_dst_consistency(&desc.dst_desc)); } return status::success; @@ -171,10 +283,16 @@ namespace dnnl { namespace impl { status_t matmul_desc_init(matmul_desc_t *matmul_desc, const memory_desc_t *src_desc, const memory_desc_t *weights_desc, - const memory_desc_t *bias_desc, const memory_desc_t *dst_desc) { + const memory_desc_t *bias_desc, const memory_desc_t *dst_desc, + const memory_desc_t *reduce_desc, matmul_reduce_kind_t reduce_kind) { VCHECK_MATMUL( !any_null(src_desc, weights_desc, dst_desc), VERBOSE_NULL_ARG); + // Note: This is an artificial limitation for the internal `reduce` feature + // to limit the scope to what is actually used. 
+ VCHECK_MATMUL( + IMPLICATION(bias_desc, !reduce_desc), VERBOSE_UNSUPPORTED_BIAS_CFG); + auto op_d = matmul_desc_t(); op_d.primitive_kind = primitive_kind::matmul; @@ -182,8 +300,17 @@ status_t matmul_desc_init(matmul_desc_t *matmul_desc, op_d.weights_desc = *weights_desc; if (bias_desc) op_d.bias_desc = *bias_desc; op_d.dst_desc = *dst_desc; + if (reduce_desc) { + VCHECK_MATMUL(reduce_desc->format_kind != format_kind::any, + VERBOSE_UNSUPPORTED_FORMAT_KIND); + op_d.reduce_desc = *reduce_desc; + op_d.reduce_kind = reduce_kind; + VCHECK_MATMUL(op_d.reduce_kind != matmul_reduce_kind::undef, + VERBOSE_BAD_PARAM); + } const bool with_bias = op_d.bias_desc.ndims != 0; + const bool with_reduce = op_d.reduce_desc.ndims != 0; const int ndims = dst_desc->ndims; VCHECK_MATMUL(ndims >= 2 && ndims <= DNNL_MAX_NDIMS, VERBOSE_BAD_NDIMS, "dst", ndims); @@ -191,6 +318,8 @@ status_t matmul_desc_init(matmul_desc_t *matmul_desc, VERBOSE_INCONSISTENT_NDIMS, "src", "weights"); VCHECK_MATMUL(IMPLICATION(with_bias, op_d.bias_desc.ndims == ndims), VERBOSE_BAD_NDIMS, "bias", op_d.bias_desc.ndims); + VCHECK_MATMUL(IMPLICATION(with_reduce, op_d.reduce_desc.ndims == ndims), + VERBOSE_BAD_NDIMS, "reduce", op_d.reduce_desc.ndims); // check: m, n, k const int m_idx = ndims - 2; @@ -212,15 +341,52 @@ status_t matmul_desc_init(matmul_desc_t *matmul_desc, dst_desc->dims[m_idx])), VERBOSE_INCONSISTENT_DIM, "bias", m_idx, "dst", m_idx); + VCHECK_MATMUL(IMPLICATION(with_reduce, + one_of(op_d.reduce_desc.dims[n_idx], 1, + dst_desc->dims[n_idx])), + VERBOSE_INCONSISTENT_DIM, "reduce", n_idx, "dst", n_idx); + VCHECK_MATMUL(IMPLICATION(with_reduce, + one_of(op_d.reduce_desc.dims[m_idx], 1, + dst_desc->dims[m_idx])), + VERBOSE_INCONSISTENT_DIM, "reduce", m_idx, "dst", m_idx); + const int bia_mask = with_bias ? utils::get_dims_mask(dst_desc->dims, op_d.bias_desc.dims, ndims) : 0; - // s4/u4 requires n to be multiple of 2 - VCHECK_MATMUL(IMPLICATION(utils::one_of(weights_desc->data_type, - data_type::s4, data_type::u4), - weights_desc->dims[n_idx] % 2 == 0), - VERBOSE_BAD_DIM, "weights", n_idx); + using namespace data_type; + if (weights_desc->format_kind == format_kind::blocked + && utils::one_of( + weights_desc->data_type, s4, u4, f4_e2m1, f4_e3m0)) { + const auto &wei_strides = weights_desc->format_desc.blocking.strides; + + int n_unit_strides = 0; + for (int d = 0; d < ndims; d++) { + if (wei_strides[d] == 1) { + n_unit_strides++; + VCHECK_MATMUL( + n_unit_strides <= 1, VERBOSE_BAD_DIM, "weights", d); + } + VCHECK_MATMUL( + IMPLICATION(wei_strides[d] > 1, wei_strides[d] % 2 == 0), + VERBOSE_BAD_DIM, "weights", d); + } + } + if (src_desc->format_kind == format_kind::blocked + && utils::one_of(src_desc->data_type, s4, u4, f4_e2m1, f4_e3m0)) { + const auto &src_strides = src_desc->format_desc.blocking.strides; + + int n_unit_strides = 0; + for (int d = 0; d < ndims; d++) { + if (src_strides[d] == 1) { + n_unit_strides++; + VCHECK_MATMUL(n_unit_strides <= 1, VERBOSE_BAD_DIM, "src", d); + } + VCHECK_MATMUL( + IMPLICATION(src_strides[d] > 1, src_strides[d] % 2 == 0), + VERBOSE_BAD_DIM, "src", d); + } + } // check if other dims match. for (int d = 0; d < ndims - 2; ++d) { @@ -228,6 +394,7 @@ status_t matmul_desc_init(matmul_desc_t *matmul_desc, const dim_t w_dim = weights_desc->dims[d]; const dim_t d_dim = dst_desc->dims[d]; const dim_t b_dim = with_bias ? op_d.bias_desc.dims[d] : 0; + const dim_t r_dim = with_reduce ? 
op_d.reduce_desc.dims[d] : 0; if (one_of(DNNL_RUNTIME_DIM_VAL, s_dim, w_dim, d_dim, b_dim)) { @@ -246,6 +413,8 @@ status_t matmul_desc_init(matmul_desc_t *matmul_desc, VERBOSE_INVALID_BROADCAST, "src", d); VCHECK_MATMUL(IMPLICATION(with_bias, one_of(b_dim, 1, d_dim)), VERBOSE_INCONSISTENT_DIM, "bias", d, "dst", d); + VCHECK_MATMUL(IMPLICATION(with_reduce, one_of(r_dim, 1, d_dim)), + VERBOSE_INCONSISTENT_DIM, "reduce", d, "dst", d); } } @@ -256,6 +425,14 @@ status_t matmul_desc_init(matmul_desc_t *matmul_desc, *matmul_desc = op_d; return status::success; } + +status_t matmul_desc_init(matmul_desc_t *matmul_desc, + const memory_desc_t *src_desc, const memory_desc_t *weights_desc, + const memory_desc_t *bias_desc, const memory_desc_t *dst_desc) { + return matmul_desc_init(matmul_desc, src_desc, weights_desc, bias_desc, + dst_desc, nullptr, matmul_reduce_kind::undef); +} + } // namespace impl } // namespace dnnl diff --git a/src/common/matmul_pd.hpp b/src/common/matmul_pd.hpp index f1963d7f8a3..76c43308044 100644 --- a/src/common/matmul_pd.hpp +++ b/src/common/matmul_pd.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2024 Intel Corporation +* Copyright 2019-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -36,15 +36,21 @@ namespace dnnl { namespace impl { +status_t matmul_desc_init(matmul_desc_t *matmul_desc, + const memory_desc_t *src_desc, const memory_desc_t *weights_desc, + const memory_desc_t *bias_desc, const memory_desc_t *dst_desc, + const memory_desc_t *reduce_desc, matmul_reduce_kind_t reduce_kind); + status_t matmul_desc_init(matmul_desc_t *matmul_desc, const memory_desc_t *src_desc, const memory_desc_t *weights_desc, const memory_desc_t *bias_desc, const memory_desc_t *dst_desc); +// NOLINTBEGIN(google-default-arguments) struct matmul_pd_t : public primitive_desc_t { static constexpr auto base_pkind = primitive_kind::matmul; - typedef matmul_pd_t base_class; - typedef matmul_pd_t hint_class; + using base_class = matmul_pd_t; + using hint_class = matmul_pd_t; const matmul_desc_t *desc() const { return &desc_; } const op_desc_t *op_desc() const override { @@ -55,8 +61,11 @@ struct matmul_pd_t : public primitive_desc_t { const bool input = utils::one_of(arg, DNNL_ARG_SRC, DNNL_ARG_WEIGHTS); if (input) return arg_usage_t::input; - if (arg == DNNL_ARG_BIAS && with_bias()) return arg_usage_t::input; + if (arg == DNNL_ARG_BIAS) + return with_bias() ? arg_usage_t::input : arg_usage_t::unused; + if (arg == DNNL_ARG_REDUCE) + return with_reduce() ? arg_usage_t::output : arg_usage_t::unused; if (arg == DNNL_ARG_DST) return arg_usage_t::output; return primitive_desc_t::arg_usage(arg); @@ -69,6 +78,7 @@ struct matmul_pd_t : public primitive_desc_t { case DNNL_ARG_WEIGHTS: return weights_md(0); case DNNL_ARG_BIAS: return weights_md(1); case DNNL_ARG_DST: return dst_md(0, user_input); + case DNNL_ARG_REDUCE: return reduce_md(0); default: return primitive_desc_t::arg_md(arg); } } @@ -93,10 +103,16 @@ struct matmul_pd_t : public primitive_desc_t { return &glob_zero_md; } + const memory_desc_t *reduce_md( + int index = 0, bool user_input = false) const { + if (index == 0) return user_input ? 
&desc()->reduce_desc : &reduce_md_; + return &glob_zero_md; + } + int n_inputs() const override { return 2 + with_bias() + n_binary_po_inputs() + n_prelu_po_inputs(); } - int n_outputs() const override { return 1; } + int n_outputs() const override { return 1 + with_reduce(); } bool has_zero_dim_memory() const { return memory_desc_wrapper(src_md(0)).has_zero_dim() @@ -113,6 +129,10 @@ struct matmul_pd_t : public primitive_desc_t { } bool with_bias() const { return bias_md_.ndims != 0; } + bool with_reduce() const { return reduce_md_.ndims != 0; } + + matmul_reduce_kind_t reduce_kind() const { return desc_.reduce_kind; } + bool batched() const { return ndims() > 2; } dim_t batch() const { @@ -159,36 +179,65 @@ struct matmul_pd_t : public primitive_desc_t { return 1 << (wei_ndims - 2); } + int dst_qmask_N() const { return wei_qmask_N(); } + + int dst_qmask_M() const { return src_qmask_M(); } + virtual bool attr_scales_ok(const std::vector<int> &supported_args = {DNNL_ARG_SRC, DNNL_ARG_WEIGHTS, DNNL_ARG_DST}) const { - if (attr()->scales_.has_default_values()) return true; + const auto &scales = attr()->scales_; + if (scales.has_default_values()) return true; - bool ok = attr()->scales_.has_default_values(supported_args); + bool ok = scales.has_default_values(supported_args); for (int arg : supported_args) { - const auto &sc = attr()->scales_.get(arg); - const auto &mask = sc.mask_; - if (sc.has_default_values()) { continue; } + if (scales.has_default_values(arg)) { continue; } + const auto &mask = scales.get_mask(arg); if (arg == DNNL_ARG_WEIGHTS) { + const auto &g0 = scales.get_group(arg, 0); + const auto &g1 = scales.get_group(arg, 1); + const bool wei_k_group_ok = IMPLICATION(g0 > 1, K() % g0 == 0); + const bool wei_n_group_ok = IMPLICATION(g1 > 1, N() % g1 == 0); + + // Any group is allowed to be greater than 1 but only one at a + // time, not both. + ok = ok + && IMPLICATION(!scales.get(arg).has_default_groups(), + utils::one_of(1, g0, g1) && wei_k_group_ok + && wei_n_group_ok); + + // Mask over K dim is allowed for decompression feature only. 
+ const bool is_decompression_or_dynquant + = utils::one_of(weights_md(0)->data_type, data_type::s8, + data_type::u8, data_type::s4, data_type::u4) + && IMPLICATION( + !types::is_integral_dt(src_md()->data_type), + attr()->fpmath_.apply_to_int_); ok = ok - && utils::one_of(mask, 0, wei_qmask_N(), - wei_qmask_K() + wei_qmask_N()); - ok = ok && utils::one_of(sc.ndims_, 0, 2) - && IMPLICATION(sc.ndims_ == 2, - sc.group_dims_[1] == 1 - && K() % sc.group_dims_[0] == 0); + && IMPLICATION((mask & wei_qmask_K()), + is_decompression_or_dynquant); } else if (arg == DNNL_ARG_SRC) { ok = ok && utils::one_of(mask, 0, src_qmask_K(), src_qmask_M() + src_qmask_K()); - ok = ok && utils::one_of(sc.ndims_, 0, 2); - ok = ok && IMPLICATION((mask & src_qmask_K()), sc.ndims_ == 2); ok = ok - && IMPLICATION(sc.ndims_ == 2, - sc.group_dims_[0] == 1 - && K() % sc.group_dims_[1] == 0); + && IMPLICATION((mask & src_qmask_K()), + !scales.get(arg).has_default_groups()); + ok = ok + && IMPLICATION(!scales.get(arg).has_default_groups(), + scales.get_group(arg, 0) == 1 + && K() % scales.get_group(arg, 1) == 0); + } else if (arg == DNNL_ARG_DST) { + ok = ok + && utils::one_of(mask, 0, dst_qmask_N(), + dst_qmask_M() + dst_qmask_N()); + ok = ok + && IMPLICATION(!scales.get(arg).has_default_groups(), + scales.get_group(arg, 1) == 1 + && (M() % scales.get_group(arg, 0)) + == 0); } else { - ok = ok && (mask == 0); + assert(!"Unsupported arg"); } } return ok; @@ -201,19 +250,22 @@ struct matmul_pd_t : public primitive_desc_t { memory_desc_t weights_md_; memory_desc_t bias_md_; memory_desc_t dst_md_; + memory_desc_t reduce_md_; - matmul_pd_t(const matmul_desc_t *adesc, const primitive_attr_t *attr, + matmul_pd_t(const op_desc_t *adesc, const primitive_attr_t *attr, const matmul_pd_t *hint_fwd_pd) : primitive_desc_t(attr, base_pkind) - , desc_(*adesc) + , desc_(*op_desc_t::to_desc(adesc)) , src_md_(desc_.src_desc) , weights_md_(desc_.weights_desc) , bias_md_(desc_.bias_desc) - , dst_md_(desc_.dst_desc) {} + , dst_md_(desc_.dst_desc) + , reduce_md_(desc_.reduce_desc) {} // temporary solution to deal with format `any` bool set_default_formats() { - for (auto md : {&src_md_, &weights_md_, &bias_md_, &dst_md_}) { + for (auto md : + {&src_md_, &weights_md_, &bias_md_, &dst_md_, &reduce_md_}) { memory_desc_wrapper mdw(md); if (mdw.format_any()) { if (mdw.has_runtime_dims_or_strides()) return false; @@ -229,9 +281,10 @@ struct matmul_pd_t : public primitive_desc_t { // call this function. bool is_dense_format_kind() { return impl::is_dense_format_kind( - {&src_md_, &weights_md_, &bias_md_, &dst_md_}); + {&src_md_, &weights_md_, &bias_md_, &dst_md_, &reduce_md_}); } }; +// NOLINTEND(google-default-arguments) } // namespace impl } // namespace dnnl diff --git a/src/common/memory.cpp b/src/common/memory.cpp index 745bfcb7d15..dd286cc4346 100644 --- a/src/common/memory.cpp +++ b/src/common/memory.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2016-2024 Intel Corporation +* Copyright 2016-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -59,19 +59,15 @@ size_t memory_desc_map_size(const memory_desc_t *md, int index = 0) { auto mdw = memory_desc_wrapper(md); if (mdw.has_runtime_dims_or_strides()) return DNNL_RUNTIME_SIZE_VAL; - if (mdw.offset0() == 0) return mdw.size(index); - memory_desc_t md_no_offset0 = *md; - md_no_offset0.offset0 = 0; - return memory_desc_wrapper(md_no_offset0).size(index) - + md->offset0 * mdw.data_type_size(); + return mdw.size(index, true, true); } } // namespace dnnl_memory::dnnl_memory(dnnl::impl::engine_t *engine, const dnnl::impl::memory_desc_t *md, const std::vector<unsigned> &flags, const std::vector<void *> &handles) - : engine_(engine), md_(*md) { + : engine_(engine), md_(*md), counter_(1) { const size_t nhandles = handles.size(); std::vector<std::unique_ptr<memory_storage_t>> mem_storages( @@ -91,14 +87,27 @@ dnnl_memory::dnnl_memory(dnnl::impl::engine_t *engine, const dnnl::impl::memory_desc_t *md, std::unique_ptr<memory_storage_t> &&memory_storage) - : engine_(engine), md_(*md) { + : engine_(engine), md_(*md), counter_(1) { this->reset_memory_storage(std::move(memory_storage)); } +#ifdef DNNL_EXPERIMENTAL_SPARSE +dnnl_memory::dnnl_memory(dnnl::impl::engine_t *engine, + const dnnl::impl::memory_desc_t *md, + std::vector<std::unique_ptr<memory_storage_t>> + &&memory_storages) + : engine_(engine) + , md_(*md) + , memory_storages_(std::move(memory_storages)) + , counter_(1) {} +#endif + status_t dnnl_memory::set_data_handle(void *handle, int index) const { using namespace dnnl::impl; void *old_handle; - CHECK(memory_storage(index)->get_data_handle(&old_handle)); + auto *ms = memory_storage(index); + if (!ms) return status::invalid_arguments; + CHECK(ms->get_data_handle(&old_handle)); if (handle != old_handle) { CHECK(memory_storage(index)->set_data_handle(handle)); } @@ -154,13 +163,14 @@ status_t dnnl_memory_create(memory_t **memory, const memory_desc_t *md, auto _memory = new memory_t(engine, md, flags, handle_ptr); if (_memory == nullptr) return out_of_memory; if (_memory->memory_storage() == nullptr) { - delete _memory; + _memory->release(); return out_of_memory; } *memory = _memory; return success; } +#ifdef DNNL_EXPERIMENTAL_SPARSE status_t dnnl_memory_create_v2(memory_t **memory, const memory_desc_t *md, engine_t *engine, int nhandles, void **handles) { const bool args_ok = !any_null(memory, engine, handles) && nhandles > 0; @@ -169,8 +179,8 @@ status_t dnnl_memory_create_v2(memory_t **memory, const memory_desc_t *md, #if DNNL_CPU_RUNTIME != DNNL_RUNTIME_SYCL if (engine->kind() == engine_kind::gpu) #endif - return dnnl_sycl_interop_memory_create( - memory, md, engine, dnnl_sycl_interop_usm, handles[0]); + return dnnl_sycl_interop_memory_create_v2( + memory, md, engine, dnnl_sycl_interop_usm, nhandles, handles); #endif memory_desc_t z_md = types::zero_md(); if (md == nullptr) md = &z_md; @@ -196,13 +206,14 @@ status_t dnnl_memory_create_v2(memory_t **memory, const memory_desc_t *md, if (_memory == nullptr) return out_of_memory; for (size_t i = 0; i < handles_vec.size(); i++) { if (_memory->memory_storage((int)i) == nullptr) { - delete _memory; + _memory->release(); return out_of_memory; } } *memory = _memory; return success; } +#endif status_t dnnl_memory_get_memory_desc( const memory_t *memory, const memory_desc_t **md) { @@ -288,8 +299,18 @@ status_t dnnl_memory_unmap_data(const memory_t *memory, void *mapped_ptr) { return dnnl_memory_unmap_data_v2(memory, mapped_ptr, 0); } +status_t dnnl_memory_unmap_data_sparse( + const_dnnl_memory_t memory, int index, void *mapped_ptr) { + bool args_ok = !any_null(memory); + if (!args_ok) 
return invalid_arguments; + + return memory->memory_storage()->unmap_data(mapped_ptr, nullptr); +} + status_t dnnl_memory_destroy(memory_t *memory) { - delete memory; + if (memory != nullptr) memory->release(); return success; } diff --git a/src/common/memory.hpp b/src/common/memory.hpp index 3d64ac1a028..5dd0de18248 100644 --- a/src/common/memory.hpp +++ b/src/common/memory.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2018-2023 Intel Corporation +* Copyright 2018-2024 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -55,7 +55,12 @@ struct dnnl_memory : public dnnl::impl::c_compatible { dnnl_memory(dnnl::impl::engine_t *engine, const dnnl::impl::memory_desc_t *md, std::unique_ptr<memory_storage_t> &&memory_storage); - virtual ~dnnl_memory() = default; +#ifdef DNNL_EXPERIMENTAL_SPARSE + dnnl_memory(dnnl::impl::engine_t *engine, + const dnnl::impl::memory_desc_t *md, + std::vector<std::unique_ptr<memory_storage_t>> + &&memory_storage); +#endif /** returns memory's engine */ dnnl::impl::engine_t *engine() const { return engine_; } @@ -77,7 +82,9 @@ struct dnnl_memory : public dnnl::impl::c_compatible { /** returns data handle */ dnnl::impl::status_t get_data_handle(void **handle, int index = 0) const { - return memory_storage(index)->get_data_handle(handle); + auto ms = memory_storage(index); + if (!ms) return dnnl::impl::status::invalid_arguments; + return ms->get_data_handle(handle); } /** sets data handle */ @@ -91,7 +98,15 @@ struct dnnl_memory : public dnnl::impl::c_compatible { size_t get_num_handles() const { return memory_storages_.size(); } + void retain() { counter_++; } + + void release() { + if (--counter_ == 0) { delete this; } + } + protected: + virtual ~dnnl_memory() = default; + dnnl::impl::engine_t *engine_; const dnnl::impl::memory_desc_t md_; @@ -101,6 +116,17 @@ struct dnnl_memory : public dnnl::impl::c_compatible { // Number of storages is larger than 1 only for sparse memory. std::vector<std::unique_ptr<memory_storage_t>> memory_storages_; + std::atomic<int> counter_; }; +namespace dnnl { +namespace impl { + +struct memory_deleter_t { + void operator()(memory_t *m) const { m->release(); } +}; + +} // namespace impl +} // namespace dnnl + #endif
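With dnnl_memory now reference-counted (counter_ starts at 1, the destructor is protected, and dnnl_memory_destroy calls release()), internal owners pair retain()/release() instead of delete. The memory_deleter_t added above lets standard smart pointers hold a reference; a minimal sketch under those assumptions:

    #include <memory>

    using memory_ptr = std::unique_ptr<dnnl::impl::memory_t,
            dnnl::impl::memory_deleter_t>;

    void hold_extra_reference(dnnl::impl::memory_t *m) {
        m->retain();        // counter_: 1 -> 2
        memory_ptr keep(m); // scope exit calls release(): 2 -> 1
    } // the original owner still controls the last reference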
diff --git a/src/common/memory_desc.cpp b/src/common/memory_desc.cpp index 15115eab9e5..fc22cd5dbc7 100644 --- a/src/common/memory_desc.cpp +++ b/src/common/memory_desc.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2022-2024 Intel Corporation +* Copyright 2022-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -109,6 +109,7 @@ status_t memory_desc_init_by_strides(memory_desc_t &memory_desc, int ndims, return success; } +#if 0 status_t memory_desc_init_by_csr_encoding(memory_desc_t &memory_desc, int ndims, const dims_t dims, data_type_t data_type, dim_t nnz, data_type_t indices_dt, data_type_t pointers_dt) { @@ -139,6 +140,37 @@ status_t memory_desc_init_by_csr_encoding(memory_desc_t &memory_desc, int ndims, return success; } +#endif + +status_t memory_desc_init_by_coo_encoding(memory_desc_t &memory_desc, int ndims, + const dims_t dims, data_type_t data_type, dim_t nnz, + data_type_t indices_dt) { + if (ndims == 0) { + memory_desc = types::zero_md(); + return success; + } + + // Only ndims <= 2 is supported at this point. + VCHECK_MEMORY(ndims <= 2, unimplemented, VERBOSE_BAD_NDIMS, "", ndims); + + bool args_ok = memory_desc_sanity_check( + ndims, dims, data_type, format_kind::undef); + VCHECK_MEMORY(args_ok, invalid_arguments, VERBOSE_MEM_DESC_CHECK_FAIL); + + auto md = memory_desc_t(); + md.ndims = ndims; + array_copy(md.dims, dims, ndims); + md.data_type = data_type; + array_copy(md.padded_dims, dims, ndims); + md.format_kind = format_kind::sparse; + md.format_desc.sparse_desc.encoding = sparse_encoding::coo; + md.format_desc.sparse_desc.nnz = nnz; + md.format_desc.sparse_desc.metadata_types[0] = indices_dt; + + memory_desc = md; + + return success; +} status_t memory_desc_init_by_packed_encoding(memory_desc_t &memory_desc, int ndims, const dims_t dims, data_type_t data_type, dim_t nnz) { @@ -441,8 +473,9 @@ status_t memory_desc_permute_axes(memory_desc_t &out_memory_desc, VCHECK_MEMORY( !memory_desc_wrapper(in_memory_desc).has_runtime_dims_or_strides(), invalid_arguments, VERBOSE_UNSUPPORTED_MEM_STRIDE); - VCHECK_MEMORY(in_memory_desc.extra.flags == 0, invalid_arguments, - VERBOSE_UNSUPPORTED_MD_FLAG, "extra"); + VCHECK_MEMORY( + check_md_extra_flags_compensation_gpu(in_memory_desc.extra.flags), + invalid_arguments, VERBOSE_UNSUPPORTED_MD_FLAG, "extra"); // verify that perm is indeed a permutation of [0 .. ndims) unsigned occurrence_mask = 0; @@ -500,8 +533,10 @@ status_t memory_desc_init_by_string_tag(memory_desc_t &md, int ndims, pos--; int dim_idx = std::tolower(tag[pos0]) - 'a'; - VCHECK_MEMORY(dim_idx < ndims, invalid_arguments, VERBOSE_BAD_NDIMS, "", - ndims); + VCHECK_MEMORY(dim_idx < ndims, invalid_arguments, + "ndims deduced (%d) from the tag \'%s\' is inconsistent with " + "provided ndims (%d)", + dim_idx + 1, tag.c_str(), ndims); ndims_from_tag = std::max(dim_idx + 1, ndims_from_tag); int block_str_len = pos0 - pos - 1; bool is_blocked = block_str_len > 0; @@ -511,7 +546,9 @@ status_t memory_desc_init_by_string_tag(memory_desc_t &md, int ndims, dim_blocks.emplace_back(dim_idx, block); } VCHECK_MEMORY((ndims_from_tag == ndims), invalid_arguments, - VERBOSE_BAD_NDIMS, "", ndims); + "ndims deduced (%d) from the tag \'%s\' is inconsistent with " + "provided ndims (%d)", + ndims_from_tag, tag.c_str(), ndims); auto &blk = md.format_desc.blocking; @@ -579,6 +616,7 @@ status_t dnnl_memory_desc_create_with_strides(memory_desc_t **memory_desc, return success; } +#if 0 status_t dnnl_memory_desc_create_with_csr_encoding(memory_desc_t **memory_desc, int ndims, const dims_t dims, data_type_t data_type, dim_t nnz, data_type_t indices_dt, data_type_t pointers_dt) { @@ -591,6 +629,53 @@ status_t dnnl_memory_desc_create_with_csr_encoding(memory_desc_t **memory_desc, (*memory_desc) = md.release(); return success; } +#endif + +status_t dnnl_memory_desc_init_sparse(sparse_desc_t **sparse_desc, + sparse_encoding_t encoding) { + if (!sparse_desc) return invalid_arguments; + auto sd = utils::make_unique<sparse_desc_t>(); + + sd->encoding = encoding; + *sparse_desc = sd.release(); + + return success; +} + +status_t dnnl_memory_desc_create_sparse(memory_desc_t **memory_desc, + sparse_encoding_t encoding, int ndims, + const dims_t dims, data_type_t data_type) { + + sparse_desc_t *sd = nullptr; + CHECK(dnnl_memory_desc_init_sparse(&sd, encoding)); + + auto md = utils::make_unique<memory_desc_t>(); + if (!md) return out_of_memory; + + md->ndims = ndims; + array_copy(md->dims, dims, ndims); + md->data_type = data_type; + array_copy(md->padded_dims, dims, ndims); + md->format_kind = format_kind::sparse; + md->format_desc.sparse_desc = *sd; + delete sd; + + *memory_desc = md.release(); + + return success; +}
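Usage sketch for the dnnl_memory_desc_create_sparse entry point added above; the caller, dimensions, and error handling are illustrative assumptions, and dnnl_coo is the C-level encoding value this fork references elsewhere in the patch:

    void make_coo_md() {
        dnnl_memory_desc_t md = nullptr;
        dnnl_dim_t dims[2] = {128, 256};
        dnnl_status_t st = dnnl_memory_desc_create_sparse(
                &md, dnnl_coo, 2, dims, dnnl_f32);
        if (st == dnnl_success) dnnl_memory_desc_destroy(md);
    }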
+status_t dnnl_memory_desc_create_with_coo_encoding(memory_desc_t **memory_desc, + int ndims, const dims_t dims, data_type_t data_type, dim_t nnz, + data_type_t indices_dt) { + if (any_null(memory_desc)) return invalid_arguments; + + auto md = utils::make_unique<memory_desc_t>(); + if (!md) return out_of_memory; + CHECK(memory_desc_init_by_coo_encoding( + *md, ndims, dims, data_type, nnz, indices_dt)); + (*memory_desc) = md.release(); + return success; +} status_t dnnl_memory_desc_create_with_packed_encoding( memory_desc_t **memory_desc, int ndims, const dims_t dims, @@ -679,6 +764,7 @@ status_t dnnl_memory_desc_query( case query::format_kind: switch ((int)md->format_kind) { case format_kind::rnn_packed: + case format_kind::cublaslt_blocked: case format_kind::wino: *(format_kind_t *)result = format_kind::opaque; break; @@ -701,6 +787,10 @@ status_t dnnl_memory_desc_query( if (!is_blocked) return status::invalid_arguments; *(const dims_t **)result = &md->format_desc.blocking.inner_idxs; break; + case query::sparse_encoding: + if (md->format_kind != format_kind::sparse) return status::invalid_arguments; + *(const dnnl_sparse_encoding_t **)result = &md->format_desc.sparse_desc.encoding; + break; default: return status::unimplemented; } return status::success; @@ -728,12 +818,20 @@ status_t dnnl_memory_desc_query_v2( case query::data_type: *(data_type_t *)result = (index == 0) ? md->data_type - : md->format_desc.sparse_desc.metadata_types[index - 1]; + : md->format_desc.sparse_desc.metadata_types + [md->format_desc.sparse_desc.encoding + == sparse_encoding_t:: + dnnl_coo + ? 0 + : index - 1]; break; case query::num_handles_s32: if (is_sparse) { switch (md->format_desc.sparse_desc.encoding) { case sparse_encoding::csr: + case sparse_encoding::coo: + *(int *)result = md->ndims + 1; + break; case sparse_encoding::packed: *(int *)result = 3; break; default: assert(!"unknown encoding"); *(int *)result = 0; } diff --git a/src/common/memory_desc.hpp b/src/common/memory_desc.hpp index b8045a2b144..00efd877a10 100644 --- a/src/common/memory_desc.hpp +++ b/src/common/memory_desc.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2022-2023 Intel Corporation +* Copyright 2024-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -23,6 +23,8 @@ namespace dnnl { namespace impl { +enum class cublaslt_memory_format_t { col32_2r_4r4 }; + // Winograd-specific formats enum class wino_memory_format_t { // Undefined memory format, used for empty memory descriptors. @@ -54,7 +56,7 @@ const rnn_packed_memory_format_t ldio_p = rnn_packed_memory_format_t::ldio_p; // TODO: convert to 'enum class'. // Flags for memory special features enum memory_extra_flags_t { - dnnl_memory_extra_flag_none = 0x0U, + dnnl_memory_extra_flag_none = 0u, // Indicates the weights have an additional buffer, that depends on the // @p compensation_mask. 
// @@ -62,13 +64,22 @@ enum memory_extra_flags_t { // the additional buffer would consist of OC values: // O[oc : 0,OC] = // -128 * SUM(ic : 0,IC; kh : 0,KH; kw : 0,KW){ weights(oc, ic, kh, kw) } - dnnl_memory_extra_flag_compensation_conv_s8s8 = 0x1U, - dnnl_memory_extra_flag_scale_adjust = 0x2U, - dnnl_memory_extra_flag_rnn_u8s8_compensation = 0x4U, + dnnl_memory_extra_flag_compensation_conv_s8s8 = 1u, + dnnl_memory_extra_flag_scale_adjust = 2u, + dnnl_memory_extra_flag_rnn_u8s8_compensation = 4u, dnnl_memory_extra_flag_gpu_rnn_u8s8_compensation = dnnl_memory_extra_flag_rnn_u8s8_compensation, - dnnl_memory_extra_flag_compensation_conv_asymmetric_src = 0x8U, - dnnl_memory_extra_flag_rnn_s8s8_compensation = 0x16U, + dnnl_memory_extra_flag_compensation_conv_asymmetric_src = 8u, + dnnl_memory_extra_flag_rnn_s8s8_compensation = 16u, + // This flag has to be kept separate from *compensation_conv_asymmetric_src + // since the GPU precompute algorithm is incompatible with that of the CPU + dnnl_memory_extra_flag_compensation_gpu_conv_asymmetric_src = 32u, + // This flag depends on *compensation_gpu_conv_asymmetric_src and is used + // when precompute is to be performed for a backward-by-data convolution + dnnl_memory_extra_flag_compensation_gpu_conv_asymmetric_src_bwd = 64u, + // This flag depends on *compensation_gpu_conv_asymmetric_src and is used + // when IC and OC are swapped to reinterpret a deconv as a BWD_D conv + dnnl_memory_extra_flag_compensation_gpu_conv_asymmetric_src_swap = 128u, }; // Create aliases for extra flags to preserve the old behavior. @@ -85,8 +96,23 @@ const memory_extra_flags_t rnn_s8s8_compensation = dnnl_memory_extra_flag_rnn_s8s8_compensation; const memory_extra_flags_t compensation_conv_asymmetric_src = dnnl_memory_extra_flag_compensation_conv_asymmetric_src; +const memory_extra_flags_t compensation_gpu_conv_asymmetric_src + = dnnl_memory_extra_flag_compensation_gpu_conv_asymmetric_src; +const memory_extra_flags_t compensation_gpu_conv_asymmetric_src_bwd + = dnnl_memory_extra_flag_compensation_gpu_conv_asymmetric_src_bwd; +const memory_extra_flags_t compensation_gpu_conv_asymmetric_src_swap + = dnnl_memory_extra_flag_compensation_gpu_conv_asymmetric_src_swap; } // namespace memory_extra_flags +inline bool check_md_extra_flags_compensation_gpu(uint64_t flags) { + using namespace memory_extra_flags; + const uint64_t c = compensation_gpu_conv_asymmetric_src; + const uint64_t b = compensation_gpu_conv_asymmetric_src_bwd; + const uint64_t s = compensation_gpu_conv_asymmetric_src_swap; + return (flags == none) || (flags == c) || (flags == (c | b)) + || (flags == (c | b | s)); +} + // Generic description of blocked data layout for most memory formats. struct blocking_desc_t { // The strides between the outermost blocks. @@ -135,6 +161,12 @@ struct rnn_packed_desc_t { size_t size; }; +struct cublaslt_blocked_desc_t { + cublaslt_memory_format_t cublaslt_format; + size_t size; +}; + +#if 0 struct sparse_desc_t { static constexpr int max_metadata_types = 2; // Each encoding defines the number of handles it requires and their @@ -193,6 +225,21 @@ struct sparse_desc_t { // - Use the block number to find an offset in the packed data // - Use the bitmask to unpack the packed data blocking_desc_t packed_desc; +} +#endif + +struct sparse_desc_t { + static constexpr int max_metadata_types = 2; + /// Specifies what encoding is used. + dnnl_sparse_encoding_t encoding; + /// Descriptor for blocked bitmask - opaque. + blocking_desc_t packed_desc; + // Number of non-zero entries. 
+ dnnl_dim_t nnz; + // Metadata types. Each encoding defines how to interpret these. + // - CSR: 0th - index data type + // 1st - pointer data type + dnnl_data_type_t metadata_types[max_metadata_types]; }; // Description of extra information stored in memory @@ -201,7 +248,12 @@ struct memory_extra_desc_t { : flags(0) , compensation_mask(0) , scale_adjust(0.0f) - , asymm_compensation_mask(0) {} + , asymm_compensation_mask(0) + , idhw {0, 0, 0} + , odhw {0, 0, 0} + , pdhw {0, 0, 0} + , ddhw {0, 0, 0} + , dst_size(0) {} // The flags contain arbitrary extra information, such as compensation. // @sa dnnl_memory_extra_flags_t uint64_t flags; @@ -211,6 +263,16 @@ struct memory_extra_desc_t { float scale_adjust; // Compensation mask for asymmetric quantization int asymm_compensation_mask; + // Precomp GPU ZP convolution input spatials + dim_t idhw[3]; + // Precomp GPU ZP convolution output spatials + dim_t odhw[3]; + // Precomp GPU ZP convolution padding spatials + dim_t pdhw[3]; + // Precomp GPU ZP convolution dilation spatials + dim_t ddhw[3]; + // Precomp GPU ZP convolution destination size + dim_t dst_size; }; status_t DNNL_API memory_desc_init_by_tag(memory_desc_t &memory_desc, int ndims, @@ -245,8 +307,7 @@ struct dnnl_memory_desc : public dnnl::impl::c_compatible { , padded_offsets {} , offset0(0) , format_kind(dnnl::impl::format_kind::undef) - , format_desc {} - , extra {} {} + , format_desc {} {} // Number of dimensions int ndims; // Dimensions in the following order: @@ -289,6 +350,8 @@ struct dnnl_memory_desc : public dnnl::impl::c_compatible { dnnl::impl::wino_desc_t wino_desc; // Tensor of packed weights for RNN. dnnl::impl::rnn_packed_desc_t rnn_packed_desc; + // Description of the data layout for memory formats used in cublasLt IMMA kernels. + dnnl::impl::cublaslt_blocked_desc_t cublaslt_blocked_desc; // Description of the sparse encodings. dnnl::impl::sparse_desc_t sparse_desc; // ... other descriptions possible diff --git a/src/common/memory_desc_wrapper.cpp b/src/common/memory_desc_wrapper.cpp index 4d6cb0b92cc..3636d051b18 100644 --- a/src/common/memory_desc_wrapper.cpp +++ b/src/common/memory_desc_wrapper.cpp @@ -1,5 +1,6 @@ /******************************************************************************* -* Copyright 2016-2024 Intel Corporation +* Copyright 2016-2025 Intel Corporation +* Copyright 2024-2025 FUJITSU LIMITED * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -26,11 +27,10 @@ namespace dnnl { namespace impl { -status_t fill_blocked(memory_desc_t &md, std::initializer_list<int> perm, - std::initializer_list<int> inner_blks, - std::initializer_list<int> inner_idxs) { +template <typename T> +static status_t fill_blocked_impl(memory_desc_t &md, T&& perm, T&& inner_blks, T&& inner_idxs) { const bool ok = true && perm.size() == (size_t)md.ndims - && inner_blks.size() == inner_idxs.size(); + && inner_blks.size() == inner_idxs.size(); if (!ok) return status::invalid_arguments; md.offset0 = 0; @@ -81,6 +81,27 @@ status_t fill_blocked(memory_desc_t &md, std::initializer_list<int> perm, return status::success; } +status_t fill_blocked(memory_desc_t &md, + std::initializer_list<int> perm, + std::initializer_list<int> inner_blks, + std::initializer_list<int> inner_idxs) { + return fill_blocked_impl(md, perm, inner_blks, inner_idxs); +} + +status_t fill_blocked(memory_desc_t &md, + std::vector<int>& perm, + std::vector<int>& inner_blks, + std::vector<int>& inner_idxs) { + return fill_blocked_impl(md, perm, inner_blks, inner_idxs); +} + +status_t fill_blocked(memory_desc_t &md, + std::vector<dim_t>& perm, + std::vector<dim_t>& inner_blks, + std::vector<dim_t>& inner_idxs) { + return fill_blocked_impl(md, perm, inner_blks, inner_idxs); +} + void memory_desc_wrapper::compute_strides_compat(dims_t *strides_compat) const { if (ndims() == 0) return; @@ -125,15 +146,15 @@ void memory_desc_wrapper::compute_strides_compat(dims_t *strides_compat) const { utils::array_copy(strides_compat[1], inner_strides, ndims()); } -status_t memory_desc_wrapper::compute_blocking( - memory_desc_t &memory_desc, format_tag_t tag) { +template <typename F, typename... Args> +status_t process_tag(F f, format_tag_t tag, Args&&... args) { using namespace format_tag; - VCHECK_MEMORY((memory_desc.ndims != 0), status::invalid_arguments, - VERBOSE_BAD_NDIMS, "", 0); + // VCHECK_MEMORY((memory_desc.ndims != 0), status::invalid_arguments, + // VERBOSE_BAD_NDIMS, "", 0); #define C(tag, ... /* perm, inner_blks, inner_idxs */) \ - case tag: return fill_blocked(memory_desc, __VA_ARGS__) + case tag: return f(std::forward<Args>(args)..., __VA_ARGS__)
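Because fill_blocked_impl deduces a single container type T for all three arguments, the overloads above can share one body; each call site just has to pass one container type consistently. A sketch of both call forms (assuming the element types reconstructed here, int and dim_t, match the fork's intent):

    #include <vector>

    void fill_two_descs(dnnl::impl::memory_desc_t &md_a,
            dnnl::impl::memory_desc_t &md_b) {
        using dnnl::impl::fill_blocked;
        // initializer_list<int> path, e.g. tag Ab8a:
        fill_blocked(md_a, {0, 1}, {8}, {0});
        // lvalue std::vector path:
        std::vector<int> perm {0, 1}, blks {8}, idxs {0};
        fill_blocked(md_b, perm, blks, idxs);
    }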
 switch (tag) { C(a, {0}, {}, {}); @@ -192,6 +213,7 @@ status_t memory_desc_wrapper::compute_blocking( C(Ab4a, {0, 1}, {4}, {0}); C(Ab8a, {0, 1}, {8}, {0}); + C(Ab32a, {0, 1}, {32}, {0}); C(BA4b4a, {1, 0}, {4, 4}, {1, 0}); C(BA8b4a, {1, 0}, {8, 4}, {1, 0}); @@ -200,6 +222,9 @@ status_t memory_desc_wrapper::compute_blocking( C(BA16a32b, {1, 0}, {16, 32}, {0, 1}); C(BA16a48b, {1, 0}, {16, 48}, {0, 1}); C(BA16a64b, {1, 0}, {16, 64}, {0, 1}); + C(BA24b8a, {1, 0}, {24, 8}, {1, 0}); + C(aCB24c8b, {0, 2, 1}, {24, 8}, {2, 1}); + C(abDC24d8c, {0, 1, 3, 2}, {24, 8}, {3, 2}); C(BA16a16b2a, {1, 0}, {16, 16, 2}, {0, 1, 0}); C(BA16a32b2a, {1, 0}, {16, 32, 2}, {0, 1, 0}); C(BA16a48b2a, {1, 0}, {16, 48, 2}, {0, 1, 0}); @@ -396,6 +421,7 @@ status_t memory_desc_wrapper::compute_blocking( C(ABcd8a16b2a, {0, 1, 2, 3}, {8, 16, 2}, {0, 1, 0}); C(BAcd8a16b2a, {1, 0, 2, 3}, {8, 16, 2}, {0, 1, 0}); C(ABcd8a8b, {0, 1, 2, 3}, {8, 8}, {0, 1}); + C(ABcd8a32b, {0, 1, 2, 3}, {8, 32}, {0, 1}); C(ABcd8a4b, {0, 1, 2, 3}, {8, 4}, {0, 1}); C(ABcd8a2b, {0, 1, 2, 3}, {8, 2}, {0, 1}); C(aBcd8b, {0, 1, 2, 3}, {8}, {1}); @@ -515,7 +541,9 @@ status_t memory_desc_wrapper::compute_blocking( C(Acb8a, {0, 2, 1}, {8}, {0}); C(AcB8a2b, {0, 2, 1}, {8, 2}, {0, 1}); C(AcB8a4b, {0, 2, 1}, {8, 4}, {0, 1}); + C(aCBd8b8c, {0, 2, 1, 3}, {8, 8}, {1, 2}); C(aCBd16b16c, {0, 2, 1, 3}, {16, 16}, {1, 2}); + C(aCBde8b8c, {0, 2, 1, 3, 4}, {8, 8}, {1, 2}); C(aCBde16b16c, {0, 2, 1, 3, 4}, {16, 16}, {1, 2}); C(Acdb16a, {0, 2, 3, 1}, {16}, {0}); C(AcdB16a2b, {0, 2, 3, 1}, {16, 2}, {0, 1}); @@ -531,7 +559,9 @@ status_t memory_desc_wrapper::compute_blocking( C(AcdeB8a4b, {0, 2, 3, 4, 1}, {8, 4}, {0, 1}); C(Acedb16a, {0, 2, 4, 3, 1}, {16}, {0}); C(Adcb16a, {0, 3, 2, 1}, {16}, {0}); + C(BAc8a8b, {1, 0, 2}, {8, 8}, {0, 1}); C(BAc16a16b, {1, 0, 2}, {16, 16}, {0, 1}); + C(BAcd8a8b, {1, 0, 2, 3}, {8, 8}, {0, 1}); C(BAcd16a16b, {1, 0, 2, 3}, {16, 16}, {0, 1}); C(ABc32a16b, {0, 1, 2}, {32, 16}, {0, 1}); C(ABcd32a16b, {0, 1, 2, 3}, {32, 16}, {0, 1}); @@ -584,13 +614,17 @@ status_t memory_desc_wrapper::compute_blocking( C(aBCde2b8c8b2c, {0, 1, 2, 3, 4}, {2, 8, 8, 2}, {1, 2, 1, 2}); C(aBdec32b, {0, 1, 3, 4, 2}, {32}, {1}); C(aCBdef16c16b, {0, 2, 1, 3, 4, 5}, {16, 16}, {2, 1}); + C(aCBdef8b8c, {0, 2, 1, 3, 4, 5}, {8, 8}, {1, 2}); C(aCBdef16b16c, {0, 2, 1, 3, 4, 5}, {16, 16}, {1, 2}); + C(Abcdef4a, {0, 1, 2, 3, 4, 5}, {4}, {0}); + C(Abcdef8a, {0, 1, 2, 3, 4, 5}, {8}, {0}); C(Abcdef16a, {0, 1, 2, 3, 4, 5}, {16}, {0}); C(Abcdef32a, {0, 1, 2, 3, 4, 5}, {32}, {0}); C(aCBd16c16b, {0, 2, 1, 3}, {16, 16}, {2, 1}); C(aCBde16c16b, {0, 2, 1, 3, 4}, {16, 16}, {2, 1}); C(Acdb32a, {0, 2, 3, 1}, {32}, {0}); C(BAcd16b16a, {1, 0, 2, 3}, {16, 16}, {1, 0}); + C(BAcde8a8b, {1, 0, 2, 3, 4}, {8, 8}, {0, 1}); C(BAcde16a16b, {1, 0, 2, 3, 4}, {16, 16}, {0, 1}); C(BAc16b16a, {1, 0, 2}, {16, 16}, {1, 0}); C(aBCd2b4c2b, {0, 1, 2, 3}, {2, 4, 2}, {1, 2, 1}); @@ -611,6 +645,7 @@ status_t memory_desc_wrapper::compute_blocking( C(AB8a2b, {0, 1}, {8, 2}, {0, 1}); C(abDc16d, {0, 1, 3, 2}, {16}, {3}); C(abDc32d, {0, 1, 3, 2}, {32}, {3}); + C(abDC16d4c, {0, 1, 3, 2}, {16, 4}, {3, 2}); C(abDC32d4c, {0, 1, 3, 2}, {32, 4}, {3, 2}); C(abCd4c, {0, 1, 2, 3}, {4}, {2}); C(abCde4c, {0, 1, 2, 3, 4}, {4}, {2}); @@ -620,6 +655,7 @@ status_t memory_desc_wrapper::compute_blocking( C(abCdef32c, {0, 1, 2, 3, 4, 5}, {32}, {2}); C(abdEc16e, {0, 1, 3, 4, 2}, {16}, {4}); 
C(abdEc32e, {0, 1, 3, 4, 2}, {32}, {4}); + C(abdEC16e4c, {0, 1, 3, 4, 2}, {16, 4}, {4, 2}); C(abdEC32e2c, {0, 1, 3, 4, 2}, {32, 2}, {4, 2}); C(abdEC32e4c, {0, 1, 3, 4, 2}, {32, 4}, {4, 2}); C(abdEC64e2c, {0, 1, 3, 4, 2}, {64, 2}, {4, 2}); @@ -991,6 +1027,28 @@ status_t memory_desc_wrapper::compute_blocking( return status::invalid_arguments; } +status_t memory_desc_wrapper::compute_blocking(memory_desc_t &memory_desc, format_tag_t tag) { + using fill_blocked_t = status_t(memory_desc_t&, std::initializer_list<int>, std::initializer_list<int>, std::initializer_list<int>); + if (memory_desc.ndims == 0) return status::invalid_arguments; + return process_tag<fill_blocked_t>(fill_blocked, tag, memory_desc); +} + +status_t memory_desc_wrapper::compute_blocking(format_tag_t tag, + std::vector<int> &perm, + std::vector<int> &inner_blks, + std::vector<int> &inner_idxs) { + + auto extract_data = [&](std::initializer_list<int> _perm, + std::initializer_list<int> _inner_blks, + std::initializer_list<int> _inner_idxs) -> status_t { + perm = {_perm.begin(), _perm.end()}; + inner_blks = {_inner_blks.begin(), _inner_blks.end()}; + inner_idxs = {_inner_idxs.begin(), _inner_idxs.end()}; + return status::success; + }; + return process_tag(extract_data, tag); +} + } // namespace impl } // namespace dnnl diff --git a/src/common/memory_desc_wrapper.hpp b/src/common/memory_desc_wrapper.hpp index 0d85e63437d..6cfad59ad81 100644 --- a/src/common/memory_desc_wrapper.hpp +++ b/src/common/memory_desc_wrapper.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2016-2024 Intel Corporation +* Copyright 2016-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -32,8 +32,20 @@ namespace dnnl { namespace impl { +status_t fill_blocked(memory_desc_t &md, + std::vector<int> &perm, + std::vector<int> &inner_blks, + std::vector<int> &inner_idxs); + +status_t fill_blocked(memory_desc_t &md, + std::vector<dim_t> &perm, + std::vector<dim_t> &inner_blks, + std::vector<dim_t> &inner_idxs); + + 
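The second compute_blocking overload above reuses process_tag with an extracting lambda, so a tag's layout recipe can be inspected without touching a memory descriptor. A hypothetical caller:

    #include <vector>

    void inspect_tag(dnnl::impl::format_tag_t tag) {
        using namespace dnnl::impl;
        std::vector<int> perm, inner_blks, inner_idxs;
        if (memory_desc_wrapper::compute_blocking(
                    tag, perm, inner_blks, inner_idxs)
                != status::success)
            return;
        // For format_tag::ABcd8a8b this yields perm = {0, 1, 2, 3},
        // inner_blks = {8, 8}, inner_idxs = {0, 1} (see the C(...) table).
    }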
/** thin wrapper class over \struct memory_desc_t which allows easy * manipulations with underlying C structure, which is taken by reference */ +// NOLINTNEXTLINE(readability-identifier-naming) struct memory_desc_wrapper : public c_compatible { const memory_desc_t *md_; @@ -67,12 +79,20 @@ struct memory_desc_wrapper : public c_compatible { bool is_rnn_packed_desc() const { return format_kind() == format_kind::rnn_packed; } + bool is_cublaslt_blocked_desc() const { + return format_kind() == format_kind::cublaslt_blocked; + } bool is_sparse_desc() const { return format_kind() == format_kind::sparse; } + bool is_blocking_or_sparse_packed_desc() const { + return is_blocking_desc() + || (is_sparse_desc() + && sparse_desc().encoding == sparse_encoding::packed); + } + const blocking_desc_t &blocking_desc() const { - assert(is_blocking_desc() || is_sparse_packed_desc()); - if (!is_sparse_desc()) return md_->format_desc.blocking; - return sparse_desc().packed_desc; + assert(is_blocking_desc()); + return md_->format_desc.blocking; } const wino_desc_t &wino_desc() const { assert(is_wino_desc()); @@ -82,6 +102,10 @@ struct memory_desc_wrapper : public c_compatible { assert(is_rnn_packed_desc()); return md_->format_desc.rnn_packed_desc; } + const cublaslt_blocked_desc_t &cublaslt_blocked_desc() const { + assert(is_cublaslt_blocked_desc()); + return md_->format_desc.cublaslt_blocked_desc; + } const sparse_desc_t &sparse_desc() const { assert(is_sparse_desc()); @@ -93,20 +117,18 @@ struct memory_desc_wrapper : public c_compatible { return sparse_desc().metadata_types[idx]; } - sparse_encoding_t encoding() const { - assert(is_sparse_desc()); - return sparse_desc().encoding; - } - dim_t nnz() const { assert(is_sparse_desc()); return sparse_desc().nnz; } - const dims_t &strides() const { return blocking_desc().strides; } - const memory_extra_desc_t &extra() const { return md_->extra; } + sparse_encoding_t encoding() const { + assert(is_sparse_desc()); + return sparse_desc().encoding; + } + /* some useful function */ /** returns the number of elements including padding if \param with_padding @@ -142,30 +164,28 @@ struct memory_desc_wrapper : public c_compatible { size_t additional_buffer_data_size(uint64_t flag_select) const { using namespace memory_extra_flags; if (flag_select & compensation_conv_s8s8) return sizeof(int32_t); - if ((flag_select & rnn_u8s8_compensation) - && !types::extra_flag_rnn_s8s8_compensation_is_set(flag_select)) - return sizeof(float); + if (flag_select & rnn_u8s8_compensation) return sizeof(float); if (flag_select & compensation_conv_asymmetric_src) return sizeof(int32_t); + if (flag_select & compensation_gpu_conv_asymmetric_src) + return sizeof(int32_t); return 0; } /** return true if memory format has additional buffer */ bool is_additional_buffer() const { using namespace memory_extra_flags; - // Currently compensation is not required for rnn_s8s8_compensation, - // but it has common bit with rnn_u8s8_compensation constant so we have - // to exclude rnn_s8s8_compensation case explicitly - return ((extra().flags - & (compensation_conv_s8s8 | rnn_u8s8_compensation - | compensation_conv_asymmetric_src)) - && !types::extra_flag_rnn_s8s8_compensation_is_set( - extra().flags)); + return extra().flags + & (compensation_conv_s8s8 | rnn_u8s8_compensation + | compensation_gpu_conv_asymmetric_src + | compensation_conv_asymmetric_src); } /** returns the size required for a particular extra memory buffer */ size_t additional_buffer_size(memory_extra_flags_t flag) const { using namespace memory_extra_flags; + const auto flags = extra().flags; + if (!(flags & flag)) return 0; const auto ndims = this->ndims(); const auto &pdims = padded_dims(); @@ -179,26 +199,26 @@ struct memory_desc_wrapper : public c_compatible { return (size_t)prod * buff_data_size; }; - if (extra().flags & compensation_conv_s8s8) { + if (flag == compensation_conv_s8s8) { return calculate_size(extra().compensation_mask, additional_buffer_data_size(flag)); } - - if ((extra().flags & rnn_u8s8_compensation) - && !types::extra_flag_rnn_s8s8_compensation_is_set( - extra().flags)) { + if (flag == rnn_u8s8_compensation) { return calculate_size(extra().compensation_mask, additional_buffer_data_size(flag)); } - if (extra().flags & compensation_conv_asymmetric_src) { + if (flag == compensation_conv_asymmetric_src) { return calculate_size(extra().asymm_compensation_mask, additional_buffer_data_size(flag)); } + if (flag == compensation_gpu_conv_asymmetric_src) { + return extra().dst_size; + } return 0; } - int blk_size() const { + dim_t blk_size() const { assert(is_blocking_desc() || is_sparse_packed_desc()); const auto &bd = blocking_desc(); return utils::array_product(bd.inner_blks, bd.inner_nblks); @@ -213,18 +233,22 @@ struct memory_desc_wrapper : public c_compatible { buff_size += additional_buffer_size(compensation_conv_s8s8); buff_size += additional_buffer_size(rnn_u8s8_compensation); buff_size += additional_buffer_size(compensation_conv_asymmetric_src); + buff_size + += 
additional_buffer_size(compensation_gpu_conv_asymmetric_src); return buff_size; } - /** returns the size required to store described memory - * note: if offset0 != 0 returns 0 (need to specify the behavior) */ - size_t size(int index = 0, bool include_additional_size = true) const { + /** returns the size required to store described memory. Note: does not + include offset0 by default */ + size_t size(int index = 0, bool include_additional_size = true, + bool include_offset0 = false) const { if (utils::one_of(format_kind(), format_kind::undef, format_kind::any) || is_zero() || has_zero_dim()) return 0; if (utils::one_of(format_kind(), format_kind::blocked, - format_kind::wino, format_kind::rnn_packed) + format_kind::wino, format_kind::rnn_packed, + format_kind::cublaslt_blocked) && index != 0) { return 0; } @@ -235,9 +259,28 @@ struct memory_desc_wrapper : public c_compatible { return wino_desc().size; } else if (is_rnn_packed_desc()) { return rnn_packed_desc().size; + } else if (is_cublaslt_blocked_desc()) { + return cublaslt_blocked_desc().size; + } else if (is_sparse_desc()) { + if (sparse_desc().encoding == sparse_encoding::packed) { + // Only 2D tensors are supported at this point. + assert(ndims() == 2); + // Only OI16i64o4i is supported at this point. + // assert(matches_tag(format_tag::OI16i64o4i)); - TODO: enable for sparse packed. + const size_t metadata = padded_dims()[0] * padded_dims()[1] / 64 + * sizeof(uint64_t); + using comp_tile_len_type = int; + size_t comp_tile_data_size = ceil(static_cast<double>(padded_dims()[0] * padded_dims()[1]) + / (64 * 64 * (64 / sizeof(comp_tile_len_type)))) * 64; + return comp_tile_data_size + (padded_dims()[0] * padded_dims()[1] * data_type_size()) + + metadata + 1000; + // todo: [av] why 1000? + } else { + printf("encoding:%d\n", (int)sparse_desc().encoding), fflush(stdout); + assert(!"unknown sparse encoding"); + return 0; + } } else if (is_blocking_desc()) { - if (offset0() != 0) return 0; - dims_t blocks = {0}; compute_blocks(blocks); @@ -252,11 +295,13 @@ struct memory_desc_wrapper : public c_compatible { } if (max_size == 1 && bd.inner_nblks != 0) { - max_size = utils::array_product(bd.inner_blks, bd.inner_nblks); + max_size = static_cast<dim_t>(blk_size()); } - size_t data_size = max_size * data_type_size() - / sub_byte_data_type_multiplier(); + // `div_up` guarantees a spot in memory for odd number of half-byte + // elements. Crucial case is `1` when simple division returns 0. + size_t data_size = utils::div_up(max_size * data_type_size(), + sub_byte_data_type_multiplier()); if (is_additional_buffer()) { // The additional buffers, typically of data type int32_t, float // are stored at the end of data. Pad the data, so that the @@ -265,7 +310,9 @@ struct memory_desc_wrapper : public c_compatible { data_size = utils::rnd_up(data_size, alignment_in_bytes); } return data_size - + (include_additional_size ? additional_buffer_size() : 0); + + (include_additional_size ? additional_buffer_size() : 0) + + (include_offset0 ? data_type_size() * offset0() : 0); +#if 0 } else if (is_sparse_desc()) { if (sparse_desc().encoding == sparse_encoding::csr) { switch (index) { @@ -283,6 +330,18 @@ struct memory_desc_wrapper : public c_compatible { } default: assert(!"unknown index"); return 0; } + } else if (sparse_desc().encoding == sparse_encoding::coo) { + // Return size for values. + if (index == 0) { + return nnz() * data_type_size(); + } else if (index > 0 && index <= ndims()) { + // Return size for index buffers. + const auto idx_dt = metadata_type(0); + return nnz() * types::data_type_size(idx_dt); + } else { + assert(!"unknown index"); + return 0; + } } else if (sparse_desc().encoding == sparse_encoding::packed) { // If the size is queried from a user-created memory descriptor. if (blocking_desc().strides[0] == 0) return 0; @@ -305,6 +364,7 @@ struct memory_desc_wrapper : public c_compatible { assert(!"unknown sparse encoding"); return 0; } +#endif } else { assert(!"unknown format kind"); return 0;
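Worked example for the div_up change in size() above, using s4/u4-like parameters (data_type_size() == 1, sub_byte_data_type_multiplier() == 2): plain division drops the trailing nibble for odd element counts and returns 0 for a single element, while rounding up does not.

    #include <cstdio>

    int main() {
        const unsigned long mult = 2; // two 4-bit elements per byte
        for (unsigned long nelems : {1ul, 3ul, 4ul}) {
            unsigned long plain = nelems / mult;                // 0, 1, 2
            unsigned long rounded = (nelems + mult - 1) / mult; // 1, 2, 2
            printf("nelems=%lu plain=%lu div_up=%lu\n", nelems, plain, rounded);
        }
        return 0;
    }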
@@ -333,8 +393,7 @@ struct memory_desc_wrapper : public c_compatible {
         if (utils::one_of(format_kind(), format_kind::undef, format_kind::any))
             return false;
         if (has_runtime_dims_or_strides() || has_broadcast()) return false;
-        return nelems(with_padding) * data_type_size()
-                        / sub_byte_data_type_multiplier()
+        return utils::div_up(nelems(with_padding) * data_type_size(),
+                       sub_byte_data_type_multiplier())
                 == size(0, /* include_additional_size = */ false);
     }
 
@@ -418,15 +477,16 @@ struct memory_desc_wrapper : public c_compatible {
      * following statement might be true: lhs == rhs && !lhs.similar_to(rhs) */
     /* TODO: revise */
     bool similar_to(const memory_desc_wrapper &rhs, bool with_padding = true,
-            bool with_data_type = true, int dim_start = 0) const;
+            bool with_data_type = true, int dim_start = 0,
+            bool use_weak_cmp = false, bool check_off0 = false,
+            uint64_t stride_mask = 0xffffffffffffffff) const;
 
     /** returns true if one memory can be reordered to another */
     bool consistent_with(const memory_desc_wrapper &rhs) const;
 
     /** returns true if the memory desc corresponds to the given format tag.
      * @sa memory_desc_matches_tag */
-    bool matches_tag(format_tag_t tag) const {
-        return memory_desc_matches_tag(*md_, tag);
+    bool matches_tag(format_tag_t tag, const dims_t strides = nullptr) const {
+        return memory_desc_matches_tag(*md_, tag, strides);
     }
 
     /** returns matching tag (or undef if match is not found)
@@ -439,14 +499,30 @@ struct memory_desc_wrapper : public c_compatible {
         return format_tag::undef;
     }
 
+    template <typename... Tags>
+    format_tag_t mb_stride_relaxed_match(Tags... tags) const {
+        dims_t skip_mb_stride {};
+        // See `memory_desc_matches_tag` comment.
+        skip_mb_stride[0] = -1;
+        for (const auto &tag : {tags...})
+            if (matches_tag(tag, skip_mb_stride)) return tag;
+        return format_tag::undef;
+    }
+
     /* offset section */
 
     /** returns physical offset by logical one. logical offset is represented by
      * an array \param pos. if \param is_pos_padded is true \param pos
      * represents the position in already padded area */
     dim_t off_v(const dims_t pos, bool is_pos_padded = false) const {
-        assert(is_blocking_desc() || is_sparse_packed_desc());
-        const blocking_desc_t &blk = blocking_desc();
+        assert(is_blocking_or_sparse_packed_desc());
+
+        const blocking_desc_t &blk = [&]() -> const blocking_desc_t & {
+            if (is_blocking_desc())
+                return blocking_desc();
+            else
+                return sparse_desc().packed_desc;
+        }();
 
         dims_t pos_copy = {0};
         for (int d = 0; d < ndims(); ++d)
@@ -520,7 +596,11 @@ struct memory_desc_wrapper : public c_compatible {
      * user responsibility to adjust the result to get offset within blocks */
     template <typename... Args>
     dim_t blk_off(Args... args) const {
-        return _blk_off<sizeof...(args), Args...>(args...);
+        assert(is_blocking_or_sparse_packed_desc());
+        if (is_blocking_desc()) {
+            return _blk_off<sizeof...(args), Args...>(args...);
+        }
+        return _blk_off_sparse<sizeof...(args), Args...>(args...);
     }
 
     template <bool skip_first, typename T, typename... Args>
     dim_t blk_off(T xn, Args... args) const {
@@ -529,12 +609,27 @@ struct memory_desc_wrapper : public c_compatible {
                 : blk_off<T, Args...>(xn, args...);
     }
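For intuition about the blocked-offset arithmetic that `off_v`/`_blk_off` implement, here is a self-contained sketch with hard-coded strides for an nChw16c-like layout (the shapes and strides are illustrative only, not taken from the library):

#include <cassert>
#include <cstdint>

// Physical offset of (n, c, h, w) for N=2, C=32, H=W=4 in an nChw16c-like
// layout: outer dims are strided, the channel remainder indexes the inner
// 16-wide block.
int64_t off_nChw16c(int64_t n, int64_t c, int64_t h, int64_t w) {
    const int64_t blk = 16;
    const int64_t str_w = blk;                 // 16
    const int64_t str_h = 4 * str_w;           // 64
    const int64_t str_C = 4 * str_h;           // 256, stride of a channel block
    const int64_t str_n = (32 / blk) * str_C;  // 512
    return n * str_n + (c / blk) * str_C + h * str_h + w * str_w + c % blk;
}

int main() {
    assert(off_nChw16c(0, 0, 0, 0) == 0);
    assert(off_nChw16c(0, 17, 0, 0) == 256 + 1); // second block, lane 1
    return 0;
}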
+    /** returns physical offset by logical one. Logical offset is represented
+     * by a tuple of block indices (\param bn, ..., \param b1, \param b0). It
+     * is a user responsibility to adjust the result to get offset within
+     * blocks. If @tparam sub_off0 is true, then offset0() will be subtracted
+     * from the result. */
+    template <bool sub_off0, typename T, typename... Args>
+    dim_t blk_off(T xn, Args... args) const {
+        return blk_off(xn, args...) - sub_off0 * offset0();
+    }
+
     /* static functions section */
     /* TODO: replace with non-static, once md_ becomes non-const ref */
 
     static status_t compute_blocking(
             memory_desc_t &memory_desc, format_tag_t tag);
 
+    static status_t compute_blocking(format_tag_t tag,
+            std::vector<int> &perm,
+            std::vector<int> &inner_blks,
+            std::vector<int> &inner_idxs);
+
 private:
     /* TODO: put logical_offset in utils */
     template <typename T>
@@ -554,41 +649,71 @@ struct memory_desc_wrapper : public c_compatible {
         return offset0();
     }
 
+    template <int ORIG_LEN>
+    dim_t _blk_off_sparse() const {
+        return offset0();
+    }
+
     template <int ORIG_LEN, typename T, typename... Args>
     dim_t _blk_off(T xc, Args... args) const {
-        assert(is_blocking_desc() || is_sparse_packed_desc());
+        assert(is_blocking_desc());
+        constexpr int dc = ORIG_LEN - sizeof...(args) - 1;
+        return xc * blocking_desc().strides[dc]
+                + _blk_off<ORIG_LEN, Args...>(args...);
+    }
+
+    template <int ORIG_LEN, typename T, typename... Args>
+    dim_t _blk_off_sparse(T xc, Args... args) const {
+        assert(is_sparse_desc());
         constexpr int dc = ORIG_LEN - sizeof...(args) - 1;
-        return xc * blocking_desc().strides[dc]
-                + _blk_off<ORIG_LEN, Args...>(args...);
+        return xc * sparse_desc().packed_desc.strides[dc]
+                + _blk_off_sparse<ORIG_LEN, Args...>(args...);
     }
 };
 
 inline bool memory_desc_wrapper::similar_to(const memory_desc_wrapper &rhs,
-        bool with_padding, bool with_data_type, int dim_start) const {
+        bool with_padding, bool with_data_type, int dim_start,
+        bool use_weak_cmp, bool check_off0, uint64_t stride_mask) const {
     using namespace utils;
 
     if (one_of(format_kind(), format_kind::undef, format_kind::any))
         return false;
-    if (is_wino_desc() || is_rnn_packed_desc()) return false;
+    if (is_wino_desc() || is_rnn_packed_desc() || is_cublaslt_blocked_desc())
+        return false;
 
     const int ds = dim_start;
     const auto &blk = blocking_desc();
     const auto &r_blk = rhs.blocking_desc();
 
+    auto custom_cpm = use_weak_cmp ? array_cmp_weak<dim_t> : array_cmp<dim_t>;
+    auto cmp_strides = [&]() {
+        if (0xffffffffffffffff == stride_mask) {
+            return custom_cpm(
+                    blk.strides + ds, r_blk.strides + ds, ndims() - ds);
+        } else {
+            for (int i = 0; i < ndims(); ++i) {
+                if (stride_mask & (1ULL << i)) {
+                    if (blk.strides[i] != r_blk.strides[i]
+                            && IMPLICATION(use_weak_cmp,
                                    (blk.strides[i] != DNNL_RUNTIME_DIM_VAL
                                            && r_blk.strides[i]
                                                    != DNNL_RUNTIME_DIM_VAL))) {
+                        return false;
+                    }
+                }
+            }
+        }
+        return true;
+    };
+
     return ndims() == rhs.ndims() && dim_start <= ndims() /* guard */
             && format_kind() == rhs.format_kind()
             && IMPLICATION(with_data_type, data_type() == rhs.data_type())
-            && array_cmp(dims() + ds, rhs.dims() + ds, ndims() - ds)
-            && array_cmp(blk.strides + ds, r_blk.strides + ds, ndims() - ds)
+            && custom_cpm(dims() + ds, rhs.dims() + ds, ndims() - ds)
+            && cmp_strides()
             && blk.inner_nblks == r_blk.inner_nblks
             && array_cmp(blk.inner_blks, r_blk.inner_blks, blk.inner_nblks)
             && array_cmp(blk.inner_idxs, r_blk.inner_idxs, blk.inner_nblks)
             && IMPLICATION(with_padding,
                     true
-                            && array_cmp(padded_dims() + ds,
+                            && custom_cpm(padded_dims() + ds,
                                     rhs.padded_dims() + ds, ndims() - ds)
-                            && array_cmp(padded_offsets() + ds,
-                                    rhs.padded_offsets() + ds, ndims() - ds));
+                            && custom_cpm(padded_offsets() + ds,
+                                    rhs.padded_offsets() + ds, ndims() - ds))
+            && IMPLICATION(check_off0,
+                    (offset0() == DNNL_RUNTIME_DIM_VAL
+                            || rhs.offset0() == DNNL_RUNTIME_DIM_VAL
+                            || offset0() == rhs.offset0()));
 }
 
 inline bool memory_desc_wrapper::consistent_with(
diff --git a/src/common/memory_storage.hpp b/src/common/memory_storage.hpp
index 822cce0391f..747e53ead5c 100644
--- a/src/common/memory_storage.hpp
+++ b/src/common/memory_storage.hpp
@@ -75,6 +75,14 @@ struct memory_storage_t : public c_compatible {
     /** returns shallow copy */
     virtual std::unique_ptr<memory_storage_t> clone() const = 0;
 
+    /** returns a shallow copy with an offset applied to the accessor pointer
+     * for buffers, to prevent the use of sub-buffers where possible */
+    virtual std::unique_ptr<memory_storage_t> clone_ptr_off(
+            size_t offset) const {
+        assert(!"not expected");
+        return nullptr;
+    }
+
     /** returns true if the pointer associated with the storage is NULL */
     bool is_null() const {
         void *ptr;
diff --git a/src/common/memory_tracking.hpp b/src/common/memory_tracking.hpp
index 9f5ca8612f1..58813ff00aa 100644
--- a/src/common/memory_tracking.hpp
+++ b/src/common/memory_tracking.hpp
@@ -1,6 +1,6 @@
 /*******************************************************************************
-* Copyright 2018-2024 Intel Corporation
-* Copyright 2024 Arm Ltd. and affiliates
+* Copyright 2018-2025 Intel Corporation
+* Copyright 2024-2025 Arm Ltd. and affiliates
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
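Returning to the extended `similar_to()` above: `stride_mask` selects which dimensions' strides participate in the comparison, one bit per dimension. A hedged sketch of the intended call pattern (the descriptors and mask choice are placeholders):

#include <cstdint>

// Bit i set => compare the stride of dimension i. Clearing bit 0 makes the
// comparison ignore the minibatch stride, mirroring what
// mb_stride_relaxed_match() does for format tags.
const uint64_t all_dims = 0xffffffffffffffffULL;
const uint64_t skip_mb = all_dims & ~uint64_t(1);
// lhs.similar_to(rhs, /*with_padding=*/true, /*with_data_type=*/true,
//         /*dim_start=*/0, /*use_weak_cmp=*/false, /*check_off0=*/false,
//         skip_mb);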
@@ -166,11 +166,14 @@ enum {
     key_brgemm_primitive_buffer_d,
     key_brgemm_primitive_zp_comp_a,
     key_brgemm_primitive_zp_comp_b,
+    key_brgemm_primitive_buffer_reduce,
+    key_brgemm_primitive_decomp_buf,
     key_concat_iptrs,
     key_concat_istrides,
     key_concat_nelems,
     key_concat_optrs,
     key_concat_tent_dst,
+    key_conv_pack_space,
     key_conv_adjusted_scales,
     key_conv_amx_inp_buffer,
     key_conv_amx_tilecfg,
@@ -179,6 +182,8 @@ enum {
     key_conv_amx_wsp_buffer,
     key_conv_bia_reduction,
     key_conv_bias_bf16_convert_wsp,
+    key_conv_bias_f16_convert_wsp,
+    key_conv_bias_s32_convert,
     key_conv_cudnn,
     key_conv_cudnn_algo,
     key_conv_cudnn_filter,
@@ -198,9 +203,13 @@ enum {
     key_conv_gemm_imtr,
     key_conv_gemm_zp_src_comp,
     key_conv_int_dat_in_acc_dt,
+    key_conv_ncsp_dst,
+    key_conv_ncsp_src,
+    key_conv_ncsp_diff_dst,
+    key_conv_ncsp_diff_src,
+    key_conv_ncsp_matmul_dst,
+    key_conv_ncsp_diff_sp_sum,
     key_conv_padded_bias,
-    key_conv_permuted_inputs,
-    key_conv_permuted_outputs,
     key_conv_permuted_weights,
     key_conv_rtus_space,
     key_conv_store_wsp,
@@ -247,20 +256,27 @@ enum {
     key_iprod_dst_bf16_convert_wsp,
     key_iprod_dst_reorder,
     key_iprod_int_dat_in_acc_dt,
+    key_iprod_src_reorder,
+    key_iprod_weights_reorder,
     key_lnorm_inv_sqrtvar,
     key_lnorm_tmp_mean,
     key_lnorm_tmp_var,
     key_lnorm_tmp_diff_ss,
     key_lnorm_reduction,
+    key_matmul_pack_space,
     key_matmul_dst_in_acc_dt,
+    key_matmul_lt_algo_scratch,
+    key_matmul_lt_block_c,
     key_matmul_src_trans,
     key_matmul_wei_trans,
     key_matmul_dst_trans,
     key_matmul_dst_cast_acc,
+    key_matmul_sparse_tmp_ptr,
     key_pool_dst_bf16cvt,
     key_pool_dst_plain2blocked_cvt,
     key_pool_ind_plain2blocked_cvt,
     key_pool_src_bf16cvt,
+    key_pool_src_f32_accum,
     key_pool_src_plain2blocked_cvt,
     key_pool_reduction,
     key_precomputed_scales,
@@ -269,6 +285,7 @@ enum {
     key_reducer_space_bctx,
     key_reduction,
     key_reduction_1,
+    key_reduction_out,
     key_reorder_cross_space,
     key_reorder_space,
     key_reorder_src_scales,
@@ -281,6 +298,9 @@ enum {
     key_reorder_rnn_weights_reduction,
     key_reorder_rnn_weights_transposition,
     key_reorder_rnn_weights_xf16_cvt,
+    key_reorder_cublaslt_src_float,
+    key_reorder_cublaslt_dst_float,
+    key_reorder_cublaslt_generic,
     key_rnn_space,
     key_rnn_bf32_attention_trans,
     key_rnn_bf32_wei_layer_trans,
@@ -302,15 +322,20 @@ enum {
     key_softmax_interim_store,
     key_sum_reduction,
     key_sum_srcs_cvt,
-    key_wino_transformed_weights,
     key_wino_U,
     key_wino_V,
     key_wino_M,
-    key_wino_workspace,
+    key_decompression_scales,
+    key_decompression_zero_points,
+    key_src_quantized,
+    key_src_dequantized_scales,
+    key_src_grouped_sum,
     // These two keys should always be the last ones,
     // even though they are not in alphabetical order
     key_nested,
     key_nested_multiple,
+    key_dw_conv_buffer,
+    key_dw_conv_padded_bias,
 };
 
 enum {
@@ -414,14 +439,8 @@ struct registry_t {
     public:
         common_iterator_t(const void *base_ptr_,
                 const std::unordered_map<int, entry_t> &map,
-                bool is_begin = true) {
-            base_ptr = base_ptr_;
-            if (is_begin) {
-                iter = map.cbegin();
-            } else {
-                iter = map.cend();
-            }
-        }
+                bool is_begin = true)
+            : base_ptr(base_ptr_)
+            , iter(is_begin ? map.cbegin() : map.cend()) {}
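For context on how keys like the ones added above are consumed, a sketch of the usual book/get pairing (the key choice and sizes are illustrative, and this assumes the registrar/grantor API keeps its usual shape):

// At primitive-descriptor creation time: reserve space under a key.
// auto scratchpad = scratchpad_registry().registrar();
// scratchpad.book<float>(memory_tracking::names::key_pool_src_f32_accum,
//         accum_nelems); // accum_nelems is a placeholder
//
// At execution time: retrieve the same buffer through the grantor.
// auto *accum = ctx.get_scratchpad_grantor().get<float>(
//         memory_tracking::names::key_pool_src_f32_accum);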
         common_iterator_t &operator++(int) {
             iter++;
             return *this;
@@ -439,8 +458,8 @@ struct registry_t {
                     (return_type)ptr_start, entry.size};
         }
     };
-    typedef common_iterator_t<void *> iterator;
-    typedef common_iterator_t<const void *> const_iterator;
+    using iterator = common_iterator_t<void *>;
+    using const_iterator = common_iterator_t<const void *>;
     iterator begin(void *base_ptr_) const {
         return iterator(base_ptr_, offset_map_);
     }
diff --git a/src/common/memory_zero_pad.cpp b/src/common/memory_zero_pad.cpp
index ebaedd428bf..77afaedb832 100644
--- a/src/common/memory_zero_pad.cpp
+++ b/src/common/memory_zero_pad.cpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2018-2024 Intel Corporation
+* Copyright 2018-2025 Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -18,6 +18,7 @@
 
 #include "dnnl_thread.hpp"
 #include "dnnl_traits.hpp"
+#include "dnnl_sel_build.hpp"
 #include "stream.hpp"
 #include "type_helpers.hpp"
 #include "utils.hpp"
@@ -25,6 +26,7 @@
 #include "memory.hpp"
 #include "primitive_exec_types.hpp"
 
+using namespace dnnl;
 using namespace dnnl::impl;
 using namespace dnnl::impl::data_type;
 using namespace dnnl::impl::status;
@@ -39,7 +41,7 @@ void typed_zero_pad_blk(const memory_desc_wrapper &m_d, void *data_handle) {
      * This allows the user to create bf16 memory
      * on non-avx512_core machines. */
     using data_t = typename utils::conditional<dt == bf16, uint16_t,
-            typename prec_traits<dt>::type>::type;
+            typename prec_traits_t<dt>::type>::type;
     auto data = reinterpret_cast<data_t *>(data_handle);
     const auto &dims = m_d.dims();
     const auto &pdims = m_d.padded_dims();
@@ -142,7 +144,7 @@ void typed_zero_pad_generic_blocked(
      * This allows the user to create bf16 memory
      * on non-avx512_core machines. */
     using data_t = typename utils::conditional<dt == bf16, uint16_t,
-            typename prec_traits<dt>::type>::type;
+            typename prec_traits_t<dt>::type>::type;
     auto data = reinterpret_cast<data_t *>(data_handle);
     const int ndims = m_d.ndims();
     const auto &dims = m_d.dims();
@@ -204,7 +206,7 @@ status_t typed_zero_pad(const memory_t *memory, const exec_ctx_t &ctx) {
     void *mapped_ptr
             = ctx.map_memory_storage(memory_storage, ctx.stream(), map_size);
 
-    auto *data = static_cast<typename prec_traits<dt>::type *>(mapped_ptr);
+    auto *data = static_cast<typename prec_traits_t<dt>::type *>(mapped_ptr);
     auto blk = mdw.blocking_desc();
 
     auto get_blksize = [&](int ind) {
@@ -219,9 +221,11 @@ status_t typed_zero_pad(const memory_t *memory, const exec_ctx_t &ctx) {
 #define CASE(blksize_, blk_kind) \
     do { \
         if (blksize == (blksize_)) { \
-            typed_zero_pad_blk<dt, blk_kind, blksize_>(mdw, data); \
-            ctx.unmap_memory_storage( \
-                    memory_storage, mapped_ptr, ctx.stream()); \
+            DNNL_CSCOPE(DNNL_MACRO_CAT3(typed_zero_pad_blk_, blksize_, blk_kind)) { \
+                typed_zero_pad_blk<dt, blk_kind, blksize_>(mdw, data); \
+                ctx.unmap_memory_storage( \
+                        memory_storage, mapped_ptr, ctx.stream()); \
+            } \
             return success; \
         } \
     } while (0)
@@ -280,6 +284,8 @@ static status_t zero_pad(const memory_t *memory, const exec_ctx_t &ctx) {
     switch (mdw.data_type()) {
         case f16: return typed_zero_pad<f16>(memory, ctx);
         case bf16: return typed_zero_pad<bf16>(memory, ctx);
+        case f4_e3m0: return typed_zero_pad<f4_e3m0>(memory, ctx);
+        case f4_e2m1: return typed_zero_pad<f4_e2m1>(memory, ctx);
         case e8m0: return typed_zero_pad<e8m0>(memory, ctx);
         case f8_e5m2: return typed_zero_pad<f8_e5m2>(memory, ctx);
         case f8_e4m3: return typed_zero_pad<f8_e4m3>(memory, ctx);
@@ -289,6 +295,8 @@ static status_t zero_pad(const memory_t *memory, const exec_ctx_t &ctx) {
         case u8: return typed_zero_pad<u8>(memory, ctx);
         case s4: return typed_zero_pad<s4>(memory, ctx);
         case u4: return typed_zero_pad<u4>(memory, ctx);
+        case bin: return typed_zero_pad<bin>(memory, ctx);
+        case nf4: return typed_zero_pad<nf4>(memory, ctx);
         default: assert(!"memory is undefined"); return unimplemented;
     }
     return unimplemented;
diff --git a/src/common/nstl.hpp b/src/common/nstl.hpp
index 45a6d7c49ac..227ecff67f2 100644
--- a/src/common/nstl.hpp
+++ b/src/common/nstl.hpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2016-2024 Intel Corporation
+* Copyright 2016-2025 Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -29,6 +29,7 @@
 
 #include "bfloat16.hpp"
 #include "float16.hpp"
+#include "float4.hpp"
 #include "float8.hpp"
 #include "int4.hpp"
 #include "internal_defs.hpp"
@@ -54,7 +55,7 @@ void *malloc(size_t size, int alignment);
 #endif
 void free(void *p);
 
-struct c_compatible {
+struct c_compatible { // NOLINT(readability-identifier-naming)
     enum { default_alignment = 64 };
     static void *operator new(size_t sz) {
         return MALLOC(sz, default_alignment);
@@ -83,14 +84,14 @@ struct c_compatible {
 namespace nstl {
 
 template <typename T>
-constexpr const T abs(const T &a) {
+constexpr T abs(const T &a) {
     return a >= 0 ? a : -a;
 }
 
 // Computes the modulus and returns the result as the least positive residue
 // when the divisor > 0.
 template <typename T>
-inline const T modulo(const T &dividend, const T &divisor) {
+inline T modulo(const T &dividend, const T &divisor) {
     static_assert(std::is_integral<T>::value, "T must be an integer type.");
     assert(divisor > 0);
     T result = dividend % divisor;
@@ -100,7 +101,7 @@ inline T modulo(const T &dividend, const T &divisor) {
 
 // Computes the additive inverse modulus and returns the result as the least
 // positive residue when the divisor > 0.
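The "least positive residue" convention above is worth restating with concrete numbers, since C++ `%` returns a negative remainder for negative dividends. A standalone sketch of the same arithmetic:

#include <cassert>

int main() {
    // modulo(-1, 5): C++ gives -1 % 5 == -1; adding the divisor corrects it.
    int r = -1 % 5; // -1
    if (r < 0) r += 5; // 4, the least positive residue
    assert(r == 4);
    // additive_inverse_modulo(-1, 5): the additive inverse of 4 mod 5 is 1.
    assert(5 - r == 1);
    return 0;
}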
 template <typename T>
-inline const T additive_inverse_modulo(const T &dividend, const T &divisor) {
+inline T additive_inverse_modulo(const T &dividend, const T &divisor) {
     static_assert(std::is_integral<T>::value, "T must be an integer type.");
     assert(divisor > 0);
     T result = modulo(dividend, divisor);
@@ -156,6 +157,38 @@ struct numeric_limits : public std::numeric_limits<T> {};
 template <>
 struct numeric_limits<bfloat16_t> : public std::numeric_limits<float> {};
 
+template <>
+struct numeric_limits<float4_e3m0_t> {
+    static constexpr float4_e3m0_t lowest() { return float4_e3m0_t(0xf, true); }
+    // Min normal is equal to the value 1.0
+    static constexpr float4_e3m0_t min() { return float4_e3m0_t(0x1, true); }
+    // Max normal is equal to the value 6.0
+    static constexpr float4_e3m0_t max() { return float4_e3m0_t(0x7, true); }
+
+    static constexpr int bias = 0x3;
+    static constexpr int digits = 1; // 1 implicit bit
+
+    static constexpr float4_e3m0_t epsilon() {
+        return float4_e3m0_t(0x3, true);
+    }
+};
+
+template <>
+struct numeric_limits<float4_e2m1_t> {
+    static constexpr float4_e2m1_t lowest() { return float4_e2m1_t(0xf, true); }
+    // Min normal is equal to the value 1.0
+    static constexpr float4_e2m1_t min() { return float4_e2m1_t(0x2, true); }
+    // Max normal is equal to the value 6.0
+    static constexpr float4_e2m1_t max() { return float4_e2m1_t(0x7, true); }
+
+    static constexpr int bias = 0x1;
+    static constexpr int digits = 2; // 1 explicit + 1 implicit bit
+
+    static constexpr float4_e2m1_t epsilon() {
+        return float4_e2m1_t(0x2, true);
+    }
+};
+
 template <>
 struct numeric_limits<float8_e8m0_t> {
     static constexpr float8_e8m0_t lowest() {
@@ -253,7 +286,7 @@ struct numeric_limits<float8_e4m3_t> {
 };
 
 template <typename T>
-struct is_integral {
+struct is_integral { // NOLINT(readability-identifier-naming)
     static constexpr bool value = false;
 };
 template <>
@@ -282,7 +315,7 @@
 };
 
 template <typename T, typename U>
-struct is_same {
+struct is_same { // NOLINT(readability-identifier-naming)
     static constexpr bool value = false;
 };
 template <typename T>
@@ -310,20 +343,20 @@ struct is_same<T, T> {
 enum nstl_status_t { success = 0, out_of_memory };
 
 template <typename T>
-class vector : public c_compatible {
+class vector : public c_compatible { // NOLINT(readability-identifier-naming)
 private:
     std::vector<T> _impl;
 
 public:
-    typedef typename std::vector<T>::iterator iterator;
-    typedef typename std::vector<T>::const_iterator const_iterator;
-    typedef typename std::vector<T>::size_type size_type;
-    vector() {}
+    using iterator = typename std::vector<T>::iterator;
+    using const_iterator = typename std::vector<T>::const_iterator;
+    using size_type = typename std::vector<T>::size_type;
+    vector() = default;
     vector(size_type n) : _impl(n) {}
     vector(size_type n, const T &value) : _impl(n, value) {}
     template <typename input_iterator>
     vector(input_iterator first, input_iterator last) : _impl(first, last) {}
-    ~vector() {}
+    ~vector() = default;
     size_type size() const { return _impl.size(); }
     T &operator[](size_type i) { return _impl[i]; }
     const T &operator[](size_type i) const { return _impl[i]; }
@@ -339,21 +372,25 @@ class vector : public c_compatible {
     }
     void clear() { _impl.clear(); }
     void push_back(const T &t) { _impl.push_back(t); }
+    template <typename... Args>
+    void emplace_back(Args &&...args) {
+        _impl.emplace_back(std::forward<Args>(args)...);
+    }
     void resize(size_type count) { _impl.resize(count); }
     void reserve(size_type count) { _impl.reserve(count); }
 };
 
 template <typename Key, typename T>
-class map : public c_compatible {
+class map : public c_compatible { // NOLINT(readability-identifier-naming)
 private:
     std::map<Key, T> _impl;
 
 public:
-    typedef typename std::map<Key, T>::iterator iterator;
-    typedef typename std::map<Key, T>::const_iterator const_iterator;
-    typedef typename std::map<Key, T>::size_type size_type;
-    map() {}
-    ~map() {}
+    using iterator = typename std::map<Key, T>::iterator;
+    using const_iterator = typename std::map<Key, T>::const_iterator;
+    using size_type = typename std::map<Key, T>::size_type;
+    map() = default;
+    ~map() = default;
     size_type size() const { return _impl.size(); }
     T &operator[](const Key &k) { return _impl[k]; }
     const T &operator[](const Key &k) const { return _impl[k]; }
@@ -369,10 +406,10 @@ class map : public c_compatible {
 
 // Compile-time sequence of indices (part of C++14)
 template <size_t... Ints>
-struct index_sequence {};
+struct index_sequence {}; // NOLINT(readability-identifier-naming)
 
 template <size_t N, size_t... Next>
-struct make_index_sequence_helper
+struct make_index_sequence_helper // NOLINT(readability-identifier-naming)
        : public make_index_sequence_helper<N - 1, N - 1, Next...> {};
 
 template <size_t... Next>
diff --git a/src/common/opdesc.hpp b/src/common/opdesc.hpp
index 8067ae0ddb6..ea2ff5c9975 100644
--- a/src/common/opdesc.hpp
+++ b/src/common/opdesc.hpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2019-2024 Intel Corporation
+* Copyright 2019-2025 Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -17,30 +17,83 @@
 #ifndef COMMON_OPDESC_HPP
 #define COMMON_OPDESC_HPP
 
-#include <vector>
-
 #include "common/c_types_map.hpp"
-#include "common/gemm_types.hpp"
-#include "common/sdpa_types.hpp"
+#include "common/memory_desc.hpp"
+#include "common/utils.hpp"
+
+#include <memory>
+#include <vector>
 
 namespace dnnl {
 namespace impl {
 
-struct reorder_desc_t {
+#define DECLARE_COMMON_OP_DESC_CLONE(op_desc_kind_t) \
+    std::unique_ptr<op_desc_t> clone() const override { \
+        return utils::make_unique<op_desc_kind_t>(*this); \
+    }
+
+// A base class for all descriptors that allows dispatching between them
+// through a dedicated `kind` field.
+struct op_desc_t {
+    virtual ~op_desc_t() = default;
+
+    virtual std::unique_ptr<op_desc_t> clone() const = 0;
+
+    // Converters to an inherited type.
+    template <typename T>
+    static const T *to_desc(const op_desc_t *op_desc) {
+        static_assert(!std::is_pointer<T>::value,
+                "T is not expected to be a pointer type.");
+        return utils::downcast<const T *>(op_desc);
+    }
+    template <typename T>
+    static T *to_desc(op_desc_t *op_desc) {
+        static_assert(!std::is_pointer<T>::value,
+                "T is not expected to be a pointer type.");
+        return utils::downcast<T *>(op_desc);
+    }
+
+    // The kind of primitive. Used for self-identifying the primitive desc.
     primitive_kind_t primitive_kind;
-    const memory_desc_t *src_md;
-    const memory_desc_t *dst_md;
-    engine_kind_t src_engine_kind;
-    engine_kind_t dst_engine_kind;
-    bool is_cross_engine;
+
+protected:
+    op_desc_t() : primitive_kind(primitive_kind::undefined) {}
+    op_desc_t(primitive_kind_t pk) : primitive_kind(pk) {}
+    op_desc_t(const op_desc_t &) = default;
+    op_desc_t &operator=(const op_desc_t &) = default;
+    op_desc_t(op_desc_t &&) = default;
+    op_desc_t &operator=(op_desc_t &&) = default;
 };
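A sketch of how the new polymorphic base is meant to be consumed by primitive descriptors (the surrounding function is hypothetical; `to_desc` and `clone` are the members defined above):

// status_t init_conv_pd(const op_desc_t *adesc) {
//     if (adesc->primitive_kind != primitive_kind::convolution)
//         return status::invalid_arguments;
//     // Checked downcast to the concrete descriptor type.
//     const auto *conv_d = op_desc_t::to_desc<convolution_desc_t>(adesc);
//     // Polymorphic copy, independent of the caller's descriptor lifetime.
//     std::unique_ptr<op_desc_t> owned = conv_d->clone();
//     return status::success;
// }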
-struct concat_desc_t {
+// A descriptor of a reorder operation.
+struct reorder_desc_t : public op_desc_t {
+    reorder_desc_t() = default;
+    reorder_desc_t(primitive_kind_t primitive_kind, const memory_desc_t *src_md,
+            const memory_desc_t *dst_md, engine_kind_t src_engine_kind,
+            engine_kind_t dst_engine_kind, bool is_cross_engine)
+        : op_desc_t(primitive_kind)
+        , src_md(src_md)
+        , dst_md(dst_md)
+        , src_engine_kind(src_engine_kind)
+        , dst_engine_kind(dst_engine_kind)
+        , is_cross_engine(is_cross_engine) {}
+
+    DECLARE_COMMON_OP_DESC_CLONE(reorder_desc_t);
+
+    const memory_desc_t *src_md {};
+    const memory_desc_t *dst_md {};
+    engine_kind_t src_engine_kind {};
+    engine_kind_t dst_engine_kind {};
+    bool is_cross_engine {};
+};
+
+// A descriptor of a concat operation.
+struct concat_desc_t : public op_desc_t {
     concat_desc_t() = default;
     concat_desc_t(primitive_kind_t primitive_kind, const memory_desc_t *dst_md,
             dim_t n, dim_t concat_dimension, const memory_desc_t *const *src_mds)
-        : primitive_kind(primitive_kind)
+        : op_desc_t(primitive_kind)
         , dst_md(dst_md)
         , n(n)
         , concat_dimension(concat_dimension) {
@@ -48,41 +101,49 @@ struct concat_desc_t {
             this->src_mds.push_back(src_mds[i]);
     }
 
-    primitive_kind_t primitive_kind;
-    const memory_desc_t *dst_md;
-    dim_t n;
-    dim_t concat_dimension;
+    DECLARE_COMMON_OP_DESC_CLONE(concat_desc_t);
+
+    const memory_desc_t *dst_md {};
+    dim_t n {};
+    dim_t concat_dimension {};
     std::vector<const memory_desc_t *> src_mds;
 };
 
-struct sum_desc_t {
+// A descriptor of a sum operation.
+struct sum_desc_t : public op_desc_t {
     sum_desc_t() = default;
     sum_desc_t(primitive_kind_t primitive_kind, const memory_desc_t *dst_md,
             dim_t n, const float *scales, const memory_desc_t *const *src_mds)
-        : primitive_kind(primitive_kind), dst_md(dst_md), n(n), scales(scales) {
+        : op_desc_t(primitive_kind), dst_md(dst_md), n(n), scales(scales) {
         for (dim_t i = 0; i < n; i++)
             this->src_mds.push_back(src_mds[i]);
     }
 
-    primitive_kind_t primitive_kind;
-    const memory_desc_t *dst_md;
-    dim_t n;
-    const float *scales;
+    DECLARE_COMMON_OP_DESC_CLONE(sum_desc_t);
+
+    const memory_desc_t *dst_md {};
+    dim_t n {};
+    const float *scales {};
     std::vector<const memory_desc_t *> src_mds;
 };
 
-struct zero_pad_desc_t {
-    primitive_kind_t primitive_kind;
+// A descriptor of a zero padding operation.
+struct zero_pad_desc_t : public op_desc_t {
+    zero_pad_desc_t() : op_desc_t(primitive_kind::zero_pad) {}
+
+    DECLARE_COMMON_OP_DESC_CLONE(zero_pad_desc_t);
 };
 
-struct inner_product_desc_t {
-    // The kind of primitive. Used for self-identifying the primitive
-    // descriptor. Must be #dnnl_inner_product.
-    primitive_kind_t primitive_kind;
+// A descriptor of an inner product operation.
+struct inner_product_desc_t : public op_desc_t {
+    inner_product_desc_t() : op_desc_t(primitive_kind::inner_product) {}
+
+    DECLARE_COMMON_OP_DESC_CLONE(inner_product_desc_t);
+
     // The kind of propagation. Possible values: forward_training,
     // forward_inference, backward_data,
     // backward_weights, and backward_bias.
-    prop_kind_t prop_kind;
+    prop_kind_t prop_kind {};
     // Source memory descriptor.
     memory_desc_t src_desc;
     // Source gradient memory descriptor.
@@ -100,20 +161,22 @@ struct inner_product_desc_t {
     // Destination gradient memory descriptor.
     memory_desc_t diff_dst_desc;
     // The accumulator data type.
-    data_type_t accum_data_type;
+    data_type_t accum_data_type {};
 };
 
-struct convolution_desc_t {
-    // The kind of primitive. Used for self-identifying the primitive
-    // descriptor. Must be #dnnl_convolution.
-    primitive_kind_t primitive_kind;
+// A descriptor of a convolution operation.
+struct convolution_desc_t : public op_desc_t { + convolution_desc_t() : op_desc_t(primitive_kind::convolution) {} + + DECLARE_COMMON_OP_DESC_CLONE(convolution_desc_t); + // The kind of propagation. Possible values: #dnnl_forward_training, // #dnnl_forward_inference, #dnnl_backward_data, // #dnnl_backward_weights, and #dnnl_backward_bias. - prop_kind_t prop_kind; + prop_kind_t prop_kind {}; // The kind of the convolution algorithm. Possible values: // #dnnl_convolution_direct. - alg_kind_t alg_kind; + alg_kind_t alg_kind {}; // Source memory descriptor. memory_desc_t src_desc; // Source gradient memory descriptor. @@ -131,51 +194,53 @@ struct convolution_desc_t { // Destination gradient memory descriptor. memory_desc_t diff_dst_desc; // Convolution strides in each spatial dimension. - dims_t strides; + dims_t strides {}; // Convolution dilates in each spatial dimension. - dims_t dilates; + dims_t dilates {}; // Padding in each spatial dimension. padding[0] is a padding in the // beginning (@p padding_l), padding[1] is a padding in the end (@p // padding_r). - dims_t padding[2]; + dims_t padding[2] {}; // The accumulator data type. Initialized automatically. - data_type_t accum_data_type; + data_type_t accum_data_type {}; // For internal use only. To mark conv is used for deconv. - bool use_inversion; + bool use_inversion {}; }; // A descriptor of a deconvolution operation. using deconvolution_desc_t = convolution_desc_t; // A descriptor of a shuffle operation. -struct shuffle_desc_t { - // The kind of primitive. Used for self-identifying the primitive - // descriptor. Must be #dnnl_shuffle. - primitive_kind_t primitive_kind; +struct shuffle_desc_t : public op_desc_t { + shuffle_desc_t() : op_desc_t(primitive_kind::shuffle) {} + + DECLARE_COMMON_OP_DESC_CLONE(shuffle_desc_t); + // The kind of propagation. Possible values: #dnnl_forward_training, // #dnnl_forward_inference, and #dnnl_backward_data. - prop_kind_t prop_kind; + prop_kind_t prop_kind {}; // Source or source gradient memory descriptor. memory_desc_t src_desc; // Destination or destination gradient memory descriptor. memory_desc_t dst_desc; // Axis for shuffling. - int axis; + int axis {}; // Number of groups. - dim_t group_size; + dim_t group_size {}; }; // A descriptor of resampling operation. -struct resampling_desc_t { - // The kind of primitive. Used for self-identifying the primitive - // descriptor. Must be #dnnl_resampling. - primitive_kind_t primitive_kind; +struct resampling_desc_t : public op_desc_t { + resampling_desc_t() : op_desc_t(primitive_kind::resampling) {} + + DECLARE_COMMON_OP_DESC_CLONE(resampling_desc_t); + // The kind of propagation. Possible values: #dnnl_forward_training, // #dnnl_forward_inference, #dnnl_backward_data, - prop_kind_t prop_kind; + prop_kind_t prop_kind {}; // The kind of the resampling algorithm. Possible values: // #dnnl_resampling_nearest, #dnnl_resampling_linear. - alg_kind_t alg_kind; + alg_kind_t alg_kind {}; // Source memory descriptor. memory_desc_t src_desc; // Source gradient memory descriptor. @@ -185,7 +250,7 @@ struct resampling_desc_t { // Destination gradient memory descriptor. memory_desc_t diff_dst_desc; // Resampling factor in each spatial dimension. - float factors[DNNL_MAX_NDIMS]; + float factors[DNNL_MAX_NDIMS] {}; }; // A descriptor of a matrix multiplication operation. @@ -195,10 +260,11 @@ struct resampling_desc_t { // // 3D case: // dst[mb, m, n] = src[mb, m, k] * weights[mb, k, n] + bias[mb, m, n] -struct matmul_desc_t { - // The kind of primitive. 
Used for self-identifying the primitive
-    // descriptor. Must be #dnnl_matmul.
-    primitive_kind_t primitive_kind;
+struct matmul_desc_t : public op_desc_t {
+    matmul_desc_t() : op_desc_t(primitive_kind::matmul) {}
+
+    DECLARE_COMMON_OP_DESC_CLONE(matmul_desc_t);
+
     // Source memory descriptor.
     memory_desc_t src_desc;
     // Weights memory descriptor.
@@ -207,18 +273,23 @@
     memory_desc_t bias_desc;
     // Destination memory descriptor.
     memory_desc_t dst_desc;
+    // Reduce memory descriptor.
+    memory_desc_t reduce_desc;
+    // Reduce kind.
+    matmul_reduce_kind_t reduce_kind {};
     // The accumulator data type. Initialized automatically.
-    data_type_t accum_data_type;
+    data_type_t accum_data_type {};
 };
 
-// A descriptor of a element-wise operation.
-struct eltwise_desc_t {
-    // The kind of primitive. Used for self-identifying the primitive
-    // descriptor. Must be #dnnl_eltwise.
-    primitive_kind_t primitive_kind;
+// A descriptor of an element-wise operation.
+struct eltwise_desc_t : public op_desc_t {
+    eltwise_desc_t() : op_desc_t(primitive_kind::eltwise) {}
+
+    DECLARE_COMMON_OP_DESC_CLONE(eltwise_desc_t);
+
     // The kind of propagation. Possible values: #dnnl_forward_training,
     // #dnnl_forward_inference, #dnnl_backward, and #dnnl_backward_data.
-    prop_kind_t prop_kind;
+    prop_kind_t prop_kind {};
     // The kind of eltwise algorithm. Possible values: #dnnl_eltwise_relu,
     // #dnnl_eltwise_tanh, #dnnl_eltwise_elu, #dnnl_eltwise_square,
     // #dnnl_eltwise_abs, #dnnl_eltwise_sqrt, #dnnl_eltwise_linear,
@@ -233,7 +304,7 @@
     // #dnnl_eltwise_logistic_use_dst_for_bwd,
     // #dnnl_eltwise_exp_use_dst_for_bwd,
     // #dnnl_eltwise_clip_v2_use_dst_for_bwd.
-    alg_kind_t alg_kind;
+    alg_kind_t alg_kind {};
     // Source memory descriptor.
     memory_desc_t src_desc;
     // Destination memory descriptor.
@@ -265,17 +336,20 @@
     // - #dnnl_eltwise_mish: @p alpha and @p beta ignored
     // - #dnnl_eltwise_hardswish: @p alpha and @p beta ignored
     // - #dnnl_eltwise_hardsigmoid: @p alpha -- scale, @p beta -- shift
-    float alpha, beta;
+    float alpha {};
+    float beta {};
 };
 
 // A descriptor of a Batch Normalization operation.
-struct batch_normalization_desc_t {
-    // The kind of primitive. Used for self-identifying the primitive
-    // descriptor. Must be #dnnl_batch_normalization.
-    primitive_kind_t primitive_kind;
+struct batch_normalization_desc_t : public op_desc_t {
+    batch_normalization_desc_t()
+        : op_desc_t(primitive_kind::batch_normalization) {}
+
+    DECLARE_COMMON_OP_DESC_CLONE(batch_normalization_desc_t);
+
     // The kind of propagation. Possible values: #dnnl_forward_training,
     // #dnnl_forward_inference, #dnnl_backward, and #dnnl_backward_data.
-    prop_kind_t prop_kind;
+    prop_kind_t prop_kind {};
     // Source memory descriptor.
     memory_desc_t src_desc;
     // Destination memory descriptor.
@@ -293,18 +367,20 @@
     // Statistics (mean or variance) descriptors use 1D #dnnl_x format[Channels].
     memory_desc_t stat_desc;
     // Batch normalization epsilon parameter.
-    float batch_norm_epsilon;
-    unsigned flags;
+    float batch_norm_epsilon {};
+    unsigned flags {};
 };
 
 // A descriptor of a Group Normalization operation.
-struct group_normalization_desc_t {
-    // The kind of primitive. Used for self-identifying the primitive
-    // descriptor. Must be #dnnl_group_normalization.
- primitive_kind_t primitive_kind; +struct group_normalization_desc_t : public op_desc_t { + group_normalization_desc_t() + : op_desc_t(primitive_kind::group_normalization) {} + + DECLARE_COMMON_OP_DESC_CLONE(group_normalization_desc_t); + // The kind of propagation. Possible values: #dnnl_forward_training, // #dnnl_forward_inference, #dnnl_backward, and #dnnl_backward_data. - prop_kind_t prop_kind; + prop_kind_t prop_kind {}; // Source memory descriptor. memory_desc_t src_desc; // Source gradient memory descriptor. @@ -318,10 +394,10 @@ struct group_normalization_desc_t { // format[Batch, groups]. memory_desc_t stat_desc; // Group normalization groups parameter. - dim_t groups; + dim_t groups {}; // Group normalization epsilon parameter. - float group_norm_epsilon; - unsigned flags; + float group_norm_epsilon {}; + unsigned flags {}; // Destination memory descriptor. memory_desc_t dst_desc; // Destination gradient memory descriptor. @@ -329,13 +405,15 @@ struct group_normalization_desc_t { }; // A descriptor of a Layer Normalization operation. -struct layer_normalization_desc_t { - // The kind of primitive. Used for self-identifying the primitive - // descriptor. Must be #dnnl_layer_normalization. - primitive_kind_t primitive_kind; +struct layer_normalization_desc_t : public op_desc_t { + layer_normalization_desc_t() + : op_desc_t(primitive_kind::layer_normalization) {} + + DECLARE_COMMON_OP_DESC_CLONE(layer_normalization_desc_t); + // The kind of propagation. Possible values: #dnnl_forward_training, // #dnnl_forward_inference, #dnnl_backward, and #dnnl_backward_data. - prop_kind_t prop_kind; + prop_kind_t prop_kind {}; // Source memory descriptor. memory_desc_t src_desc; // Source gradient memory descriptor. @@ -353,8 +431,8 @@ struct layer_normalization_desc_t { // (stride[last_dim] == 1) user-provided format. memory_desc_t stat_desc; // Layer normalization epsilon parameter. - float layer_norm_epsilon; - unsigned flags; + float layer_norm_epsilon {}; + unsigned flags {}; // Destination memory descriptor. memory_desc_t dst_desc; // Destination gradient memory descriptor. @@ -362,16 +440,17 @@ struct layer_normalization_desc_t { }; // A descriptor of a Local Response Normalization (LRN) operation. -struct lrn_desc_t { - // The kind of primitive. Used for self-identifying the primitive - // descriptor. Must be #dnnl_lrn. - primitive_kind_t primitive_kind; +struct lrn_desc_t : public op_desc_t { + lrn_desc_t() : op_desc_t(primitive_kind::lrn) {} + + DECLARE_COMMON_OP_DESC_CLONE(lrn_desc_t); + // The kind of propagation. Possible values: #dnnl_forward_training, // #dnnl_forward_inference, #dnnl_backward, and #dnnl_backward_data. - prop_kind_t prop_kind; + prop_kind_t prop_kind {}; // LRN algorithm. Possible values: #dnnl_lrn_within_channel and // #dnnl_lrn_across_channels. - alg_kind_t alg_kind; + alg_kind_t alg_kind {}; // Source memory descriptor. memory_desc_t src_desc; // Destination memory descriptor. @@ -382,26 +461,27 @@ struct lrn_desc_t { memory_desc_t diff_dst_desc; // The number of channels to sum over (for cross-channel LRN) or the side // length of the square region to sum over (for within-channel LRN). - dim_t local_size; + dim_t local_size {}; // LRN alpha parameter. - float lrn_alpha; + float lrn_alpha {}; // LRN beta parameter. - float lrn_beta; + float lrn_beta {}; // LRN k parameter. - float lrn_k; + float lrn_k {}; }; // A descriptor of reduction operation. -struct reduction_desc_t { - // The kind of primitive. Used for self-identifying the primitive - // descriptor. 
Must be #dnnl_reduction. - primitive_kind_t primitive_kind; +struct reduction_desc_t : public op_desc_t { + reduction_desc_t() : op_desc_t(primitive_kind::reduction) {} + + DECLARE_COMMON_OP_DESC_CLONE(reduction_desc_t); + // The kind of reduction algorithm. Possible values: // #dnnl_reduction_max, #dnnl_reduction_min, #dnnl_reduction_sum, // #dnnl_reduction_mul, #dnnl_reduction_mean, #dnnl_reduction_norm_lp_max, // #dnnl_reduction_norm_lp_sum, #dnnl_reduction_norm_lp_power_p_max, // #dnnl_reduction_norm_lp_power_p_sum. - alg_kind_t alg_kind; + alg_kind_t alg_kind {}; // Source memory descriptor. memory_desc_t src_desc; // Destination memory descriptor. @@ -417,26 +497,28 @@ struct reduction_desc_t { // #dnnl_reduction_sum: @p p and @p eps are ignored // #dnnl_reduction_mul: @p p and @p eps are ignored // #dnnl_reduction_mean: @p p and @p eps are ignored - float p, eps; + float p {}; + float eps {}; }; /// A descriptor of a Softmax operation. -struct softmax_desc_t { - // The kind of primitive. Used for self-identifying the primitive - // descriptor. Must be #dnnl_softmax. - primitive_kind_t primitive_kind; +struct softmax_desc_t : public op_desc_t { + softmax_desc_t() : op_desc_t(primitive_kind::softmax) {} + + DECLARE_COMMON_OP_DESC_CLONE(softmax_desc_t); + // The kind of propagation. Possible values: #dnnl_forward_training, // #dnnl_forward_inference, and #dnnl_backward_data. - prop_kind_t prop_kind; + prop_kind_t prop_kind {}; // Source memory descriptor. memory_desc_t src_desc; // Source gradient memory descriptor. memory_desc_t diff_src_desc; // The axis along which to perform the softmax. - int softmax_axis; + int softmax_axis {}; // Softmax algorithm. Possible values: #dnnl_softmax_accurate and // #dnnl_softmax_log. - alg_kind_t alg_kind; + alg_kind_t alg_kind {}; // Destination memory descriptor. memory_desc_t dst_desc; // Destination gradient memory descriptor. @@ -444,28 +526,32 @@ struct softmax_desc_t { }; // A descriptor of a binary operation. -struct binary_desc_t { - // The kind of primitive. Used for self-identifying the primitive - // descriptor. Must be #dnnl_binary. - primitive_kind_t primitive_kind; +struct binary_desc_t : public op_desc_t { + binary_desc_t() : op_desc_t(primitive_kind::binary) {} + + DECLARE_COMMON_OP_DESC_CLONE(binary_desc_t); + // The kind of the binary algorithm. Possible values: // #dnnl_binary_add, #dnnl_binary_mul, #dnnl_binary_max, #dnnl_binary_min, - // #dnnl_binary_div and #dnnl_binary_sub. - alg_kind_t alg_kind; + // #dnnl_binary_div, #dnnl_binary_sub, #dnnl_binary_ge, #dnnl_binary_gt, + // #dnnl_binary_le, #dnnl_binary_lt, #dnnl_binary_eq, #dnnl_binary_ne, + // and #dnnl_binary_select + alg_kind_t alg_kind {}; // Source memory descriptors. - memory_desc_t src_desc[2]; + memory_desc_t src_desc[3] {}; // Destination memory descriptor. memory_desc_t dst_desc; }; /// A descriptor of a PReLU operation. -struct prelu_desc_t { - // The kind of primitive. Used for self-identifying the primitive - // descriptor. Must be #dnnl_prelu. - primitive_kind_t primitive_kind; +struct prelu_desc_t : public op_desc_t { + prelu_desc_t() : op_desc_t(primitive_kind::prelu) {} + + DECLARE_COMMON_OP_DESC_CLONE(prelu_desc_t); + // The kind of propagation. Possible values: #dnnl_forward_training, // #dnnl_forward_inference, #dnnl_backward - prop_kind_t prop_kind; + prop_kind_t prop_kind {}; // Source memory descriptor. memory_desc_t src_desc; // Learnable parameter alpha memory descriptor. 
@@ -482,18 +568,19 @@ struct prelu_desc_t { }; // A descriptor of a pooling operation. -struct pooling_desc_t { - // The kind of primitive. Used for self-identifying the primitive - // descriptor. Must be #dnnl_pooling. - primitive_kind_t primitive_kind; +struct pooling_desc_t : public op_desc_t { + pooling_desc_t() : op_desc_t(primitive_kind::pooling) {} + + DECLARE_COMMON_OP_DESC_CLONE(pooling_desc_t); + // The kind of propagation. Possible values: #dnnl_forward_training, // #dnnl_forward_inference, #dnnl_backward, and #dnnl_backward_data. - prop_kind_t prop_kind; + prop_kind_t prop_kind {}; // The kind of pooling algorithm. // Possible values: #dnnl_pooling_max, // #dnnl_pooling_avg_include_padding, and // #dnnl_pooling_avg_exclude_padding. - alg_kind_t alg_kind; + alg_kind_t alg_kind {}; // Source memory descriptor. memory_desc_t src_desc; // Source gradient memory descriptor. @@ -503,32 +590,33 @@ struct pooling_desc_t { // Destination gradient memory descriptor. memory_desc_t diff_dst_desc; // Pooling kernel strides for spatial dimensions. - dims_t strides; + dims_t strides {}; // Pooling kernel spatial dimensions. - dims_t kernel; + dims_t kernel {}; // Padding in each spatial dimension. padding[0] is a padding in the // beginning (@p padding_l), padding[1] is a padding in the end (@p // padding_r). - dims_t padding[2]; + dims_t padding[2] {}; // The accumulator data type. Initialized automatically. - data_type_t accum_data_type; + data_type_t accum_data_type {}; // Pooling dilations for spatial dimensions. - dims_t dilation; + dims_t dilation {}; }; // A descriptor for an RNN operation. -struct rnn_desc_t { - // The kind of primitive. Used for self-identifying the primitive - // descriptor. Must be #dnnl_rnn. - dnnl_primitive_kind_t primitive_kind; +struct rnn_desc_t : public op_desc_t { + rnn_desc_t() : op_desc_t(primitive_kind::rnn) {} + + DECLARE_COMMON_OP_DESC_CLONE(rnn_desc_t); + // The kind of propagation. Possible values: #dnnl_forward_training, // #dnnl_forward_inference, and #dnnl_backward. - prop_kind_t prop_kind; + prop_kind_t prop_kind {}; // RNN cell kind. Must be one of #dnnl_vanilla_rnn, // #dnnl_vanilla_lstm, #dnnl_vanilla_gru, or #dnnl_lbr_gru. - alg_kind_t cell_kind; + alg_kind_t cell_kind {}; // The direction of RNN primitive execution. - rnn_direction_t direction; + rnn_direction_t direction {}; // Source layer memory descriptor. memory_desc_t src_layer_desc; // Source iteration memory descriptor for hidden state. @@ -584,82 +672,15 @@ struct rnn_desc_t { memory_desc_t diff_weights_projection_desc; // RNN cell flags - unsigned int flags; + unsigned flags {}; // Activation function used for vanilla_rnn cell kind. // Must be either #dnnl_eltwise_relu or #dnnl_eltwise_tanh. 
-    alg_kind_t activation_kind;
-    float alpha;
-    float beta;
+    alg_kind_t activation_kind {};
+    float alpha {};
+    float beta {};
 };
 
-struct op_desc_t {
-    union {
-        primitive_kind_t kind;
-        convolution_desc_t convolution;
-        deconvolution_desc_t deconvolution;
-        shuffle_desc_t shuffle;
-        pooling_desc_t pooling;
-        prelu_desc_t prelu;
-        eltwise_desc_t eltwise;
-        softmax_desc_t softmax;
-        lrn_desc_t lrn;
-        batch_normalization_desc_t batch_normalization;
-        group_normalization_desc_t group_normalization;
-        layer_normalization_desc_t layer_normalization;
-        inner_product_desc_t inner_product;
-        rnn_desc_t rnn;
-        gemm_desc_t gemm;
-        concat_desc_t concat;
-        reorder_desc_t reorder;
-        sum_desc_t sum;
-        binary_desc_t binary;
-        matmul_desc_t matmul;
-        resampling_desc_t resampling;
-        zero_pad_desc_t zero_pad;
-        reduction_desc_t reduction;
-        sdpa_desc_t sdpa;
-    };
-
-#define DECL_CTOR_AND_CONVERTERS(c_type) \
-    op_desc_t(const c_type &) = delete; \
-    static op_desc_t *convert_from_c(c_type *_) { \
-        return reinterpret_cast<op_desc_t *>(_); \
-    } \
-    static const op_desc_t *convert_from_c(const c_type *_) { \
-        return reinterpret_cast<const op_desc_t *>(_); \
-    }
-
-    DECL_CTOR_AND_CONVERTERS(convolution_desc_t);
-    DECL_CTOR_AND_CONVERTERS(shuffle_desc_t);
-    DECL_CTOR_AND_CONVERTERS(pooling_desc_t);
-    DECL_CTOR_AND_CONVERTERS(prelu_desc_t);
-    DECL_CTOR_AND_CONVERTERS(eltwise_desc_t);
-    DECL_CTOR_AND_CONVERTERS(softmax_desc_t);
-    DECL_CTOR_AND_CONVERTERS(lrn_desc_t);
-    DECL_CTOR_AND_CONVERTERS(batch_normalization_desc_t);
-    DECL_CTOR_AND_CONVERTERS(group_normalization_desc_t);
-    DECL_CTOR_AND_CONVERTERS(layer_normalization_desc_t);
-    DECL_CTOR_AND_CONVERTERS(inner_product_desc_t);
-    DECL_CTOR_AND_CONVERTERS(rnn_desc_t);
-    DECL_CTOR_AND_CONVERTERS(gemm_desc_t);
-    DECL_CTOR_AND_CONVERTERS(concat_desc_t);
-    DECL_CTOR_AND_CONVERTERS(reorder_desc_t);
-    DECL_CTOR_AND_CONVERTERS(sum_desc_t);
-    DECL_CTOR_AND_CONVERTERS(binary_desc_t);
-    DECL_CTOR_AND_CONVERTERS(matmul_desc_t);
-    DECL_CTOR_AND_CONVERTERS(resampling_desc_t);
-    DECL_CTOR_AND_CONVERTERS(zero_pad_desc_t);
-    DECL_CTOR_AND_CONVERTERS(reduction_desc_t);
-    DECL_CTOR_AND_CONVERTERS(sdpa_desc_t);
-
-    // concat_desc_t and sum_desc_t have data members which have non-trivial
-    // special member functions hence the default destructor is implicitly
-    // deleted by the compiler which causes a warning on Windows so we should
-    // delete the destructor explicitly.
-    ~op_desc_t() = delete;
-
-#undef DECL_CTOR_AND_CONVERTERS
-};
+#undef DECLARE_COMMON_OP_DESC_CLONE
 
 } // namespace impl
 } // namespace dnnl
diff --git a/src/common/optional.hpp b/src/common/optional.hpp
index 83eac9eb70c..93388b3b80c 100644
--- a/src/common/optional.hpp
+++ b/src/common/optional.hpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2021-2024 Intel Corporation
+* Copyright 2021-2025 Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -58,7 +58,7 @@ class optional_t {
     optional_t(const nullopt_t nullopt_) : has_value_(false), dummy {} {}
     optional_t() : optional_t(nullopt) {}
-    optional_t(T object) : has_value_(true), value_(object) {}
+    optional_t(const T &object) : has_value_(true), value_(object) {}
     optional_t(const optional_t &other)
         : has_value_(other.has_value_), dummy {} {
         if (has_value_) new (std::addressof(value_)) T(other.value_);
diff --git a/src/common/pooling.cpp b/src/common/pooling.cpp
index c20685bc6b3..a1fd610fe68 100644
--- a/src/common/pooling.cpp
+++ b/src/common/pooling.cpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2016-2023 Intel Corporation
+* Copyright 2016-2025 Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -117,13 +117,19 @@ status_t pooling_desc_init(pooling_desc_t *pool_desc, prop_kind_t prop_kind,
         VCHECK_POOLING((src - ker_range + pad_l + pad_r) / str + 1 == dst,
                 VERBOSE_INCONSISTENT_PRB)
 
+        // [fork] Initially this check was commented out here and the padding
+        // handling in nchw_pooling was corrected accordingly. After the
+        // rebase to oneDNN v2.7, the nchw_pooling changes caused accuracy
+        // test failures. With the check commented out and no changes in
+        // nchw_pooling, no issues were found.
+
         // It's not allowed for pooling window to be totally placed outside
         // of real source domain for pooling_avg_exclude_padding algorithm
         // due to 0 / 0 ambiguity
-        VCHECK_POOLING(
-                IMPLICATION(alg_kind == pooling_avg_exclude_padding,
-                        (pad_l < ker_range && pad_r < ker_range && dil < src)),
-                VERBOSE_INCONSISTENT_PRB);
+        // VCHECK_POOLING(
+        //         IMPLICATION(alg_kind == pooling_avg_exclude_padding,
+        //                 (pad_l < ker_range && pad_r < ker_range && dil < src)),
+        //         VERBOSE_INCONSISTENT_PRB);
     }
 
     *pool_desc = pd;
@@ -151,8 +157,11 @@ status_t pooling_attr_check(const pooling_desc_t &desc, const engine_t *engine,
     if (!attr->post_ops_.has_default_values()) {
         const auto &po = attr->post_ops_;
         using namespace primitive_kind;
-        VCHECK_POOLING_IMPL(po.has_default_values({binary, eltwise}),
+        VCHECK_POOLING_IMPL(
+                po.has_default_values({binary, eltwise, quantization}),
                 VERBOSE_UNSUPPORTED_POSTOP);
+
+        // Note: verbose support is inside the call.
+        CHECK(po.validate_binary_with_dst_consistency(&desc.dst_desc));
     }
 } else {
     VCHECK_POOLING_IMPL(false, VERBOSE_UNSUPPORTED_ATTR);
diff --git a/src/common/pooling_pd.hpp b/src/common/pooling_pd.hpp
index 0690497ac38..62a7cdec05b 100644
--- a/src/common/pooling_pd.hpp
+++ b/src/common/pooling_pd.hpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2016-2024 Intel Corporation
+* Copyright 2016-2025 Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -146,12 +146,11 @@ struct pooling_pd_t : public primitive_desc_t {
 
     memory_desc_t ws_md_;
 
-    pooling_pd_t(const pooling_desc_t *adesc, const primitive_attr_t *attr,
+    pooling_pd_t(const op_desc_t *adesc, const primitive_attr_t *attr,
             const pooling_fwd_pd_t *hint_fwd_pd)
         : primitive_desc_t(attr, base_pkind)
-        , desc_(*adesc)
-        , hint_fwd_pd_(hint_fwd_pd)
-        , ws_md_() {}
+        , desc_(*op_desc_t::to_desc<pooling_desc_t>(adesc))
+        , hint_fwd_pd_(hint_fwd_pd) {}
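A worked instance of the 0 / 0 ambiguity that the disabled check in pooling.cpp guarded against (numbers are illustrative):

#include <algorithm>
#include <cassert>

int main() {
    // kernel = 3, pad_l = 3, src = 2: the first output window covers input
    // positions [-3, -1], which lie entirely in padding.
    const int ker = 3, pad_l = 3, src = 2;
    const int lo = std::max(-pad_l, 0);                 // 0
    const int hi = std::min(-pad_l + ker - 1, src - 1); // -1
    const int num_summands = std::max(0, hi - lo + 1);  // 0
    assert(num_summands == 0); // avg_exclude_padding would divide by 0 here
    return 0;
}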
 
     void init_default_ws(data_type_t dt = data_type::undef) {
         ws_md_ = is_fwd() ? *dst_md() : *diff_dst_md();
@@ -161,7 +160,7 @@ struct pooling_pd_t : public primitive_desc_t {
     data_type_t indices_data_type() const {
         /* the simplest way to express 256... */
         const int u8_max = nstl::numeric_limits<
-                typename prec_traits<data_type::u8>::type>::max();
+                typename prec_traits_t<data_type::u8>::type>::max();
         return utils::array_product(desc()->kernel, spatial_ndims()) <= u8_max
                 ? data_type::u8
                 : data_type::s32;
@@ -176,17 +175,19 @@ struct pooling_pd_t : public primitive_desc_t {
     }
 };
 
+// NOLINTBEGIN(google-default-arguments)
 struct pooling_fwd_pd_t : public pooling_pd_t {
-    typedef pooling_fwd_pd_t base_class;
-    typedef pooling_fwd_pd_t hint_class;
+    using base_class = pooling_fwd_pd_t;
+    using hint_class = pooling_fwd_pd_t;
 
     arg_usage_t arg_usage(int arg) const override {
         if (arg == DNNL_ARG_SRC) return arg_usage_t::input;
 
         if (arg == DNNL_ARG_DST) return arg_usage_t::output;
 
-        if (arg == DNNL_ARG_WORKSPACE && (!types::is_zero_md(workspace_md())))
-            return arg_usage_t::output;
+        if (arg == DNNL_ARG_WORKSPACE)
+            return !types::is_zero_md(workspace_md()) ? arg_usage_t::output
+                                                      : arg_usage_t::unused;
 
         return primitive_desc_t::arg_usage(arg);
     }
@@ -215,7 +216,7 @@ struct pooling_fwd_pd_t : public pooling_pd_t {
                 : &glob_zero_md;
     }
 
-    int n_inputs() const override { return 1 + n_binary_po_inputs(); }
+    int n_inputs() const override {
+        return 1 + n_binary_po_inputs() + n_depthwise_po_inputs()
+                + n_quantization_po_inputs();
+    }
     int n_outputs() const override {
         return 1 + (!types::is_zero_md(workspace_md()));
     }
@@ -229,7 +230,7 @@ struct pooling_fwd_pd_t : public pooling_pd_t {
     memory_desc_t src_md_;
     memory_desc_t dst_md_;
 
-    pooling_fwd_pd_t(const pooling_desc_t *adesc, const primitive_attr_t *attr,
+    pooling_fwd_pd_t(const op_desc_t *adesc, const primitive_attr_t *attr,
            const pooling_fwd_pd_t *hint_fwd_pd)
        : pooling_pd_t(adesc, attr, hint_fwd_pd)
        , src_md_(desc_.src_desc)
@@ -245,18 +246,21 @@ struct pooling_fwd_pd_t : public pooling_pd_t {
                dst_md_, src_md_.format_desc.blocking);
    }
 };
+// NOLINTEND(google-default-arguments)
 
+// NOLINTBEGIN(google-default-arguments)
 struct pooling_bwd_pd_t : public pooling_pd_t {
-    typedef pooling_bwd_pd_t base_class;
-    typedef pooling_fwd_pd_t hint_class;
+    using base_class = pooling_bwd_pd_t;
+    using hint_class = pooling_fwd_pd_t;
 
     arg_usage_t arg_usage(int arg) const override {
         if (arg == DNNL_ARG_DIFF_DST) return arg_usage_t::input;
 
         if (arg == DNNL_ARG_DIFF_SRC) return arg_usage_t::output;
 
-        if (arg == DNNL_ARG_WORKSPACE && (!types::is_zero_md(workspace_md())))
-            return arg_usage_t::input;
+        if (arg == DNNL_ARG_WORKSPACE)
+            return !types::is_zero_md(workspace_md()) ? arg_usage_t::input
+                                                      : arg_usage_t::unused;
 
         return primitive_desc_t::arg_usage(arg);
     }
@@ -302,7 +306,7 @@ struct pooling_bwd_pd_t : public pooling_pd_t {
     memory_desc_t diff_src_md_;
     memory_desc_t diff_dst_md_;
 
-    pooling_bwd_pd_t(const pooling_desc_t *adesc, const primitive_attr_t *attr,
+    pooling_bwd_pd_t(const op_desc_t *adesc, const primitive_attr_t *attr,
             const pooling_fwd_pd_t *hint_fwd_pd)
         : pooling_pd_t(adesc, attr, hint_fwd_pd)
         , diff_src_md_(desc_.diff_src_desc)
@@ -338,6 +342,7 @@ struct pooling_bwd_pd_t : public pooling_pd_t {
 
 private:
     std::vector<memory_desc_t> hint_mds_;
 };
+// NOLINTEND(google-default-arguments)
 
 } // namespace impl
 } // namespace dnnl
diff --git a/src/common/prelu_pd.hpp b/src/common/prelu_pd.hpp
index 2b3d96ff9d5..de5305d5b4f 100644
--- a/src/common/prelu_pd.hpp
+++ b/src/common/prelu_pd.hpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2020-2024 Intel Corporation
+* Copyright 2020-2025 Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -78,18 +78,19 @@ struct prelu_pd_t : public primitive_desc_t {
     memory_desc_t src_md_;
     memory_desc_t weights_md_;
 
-    prelu_pd_t(const prelu_desc_t *adesc, const primitive_attr_t *attr,
+    prelu_pd_t(const op_desc_t *adesc, const primitive_attr_t *attr,
             const prelu_fwd_pd_t *hint_fwd_pd)
         : primitive_desc_t(attr, base_pkind)
-        , desc_(*adesc)
+        , desc_(*op_desc_t::to_desc<prelu_desc_t>(adesc))
         , hint_fwd_pd_(hint_fwd_pd)
         , src_md_(desc_.src_desc)
         , weights_md_(desc_.weights_desc) {}
 };
 
+// NOLINTBEGIN(google-default-arguments)
 struct prelu_fwd_pd_t : public prelu_pd_t {
-    typedef prelu_fwd_pd_t base_class;
-    typedef prelu_fwd_pd_t hint_class;
+    using base_class = prelu_fwd_pd_t;
+    using hint_class = prelu_fwd_pd_t;
 
     primitive_desc_t::arg_usage_t arg_usage(int arg) const override {
         if (arg == DNNL_ARG_SRC) return arg_usage_t::input;
@@ -133,7 +134,7 @@ struct prelu_fwd_pd_t : public prelu_pd_t {
 protected:
     memory_desc_t dst_md_;
 
-    prelu_fwd_pd_t(const prelu_desc_t *adesc, const primitive_attr_t *attr,
+    prelu_fwd_pd_t(const op_desc_t *adesc, const primitive_attr_t *attr,
             const prelu_fwd_pd_t *hint_fwd_pd)
         : prelu_pd_t(adesc, attr, hint_fwd_pd), dst_md_(desc_.dst_desc) {}
 
@@ -148,10 +149,12 @@ struct prelu_fwd_pd_t : public prelu_pd_t {
                 == status::success);
     }
 };
+// NOLINTEND(google-default-arguments)
 
+// NOLINTBEGIN(google-default-arguments)
 struct prelu_bwd_pd_t : public prelu_pd_t {
-    typedef prelu_bwd_pd_t base_class;
-    typedef prelu_fwd_pd_t hint_class;
+    using base_class = prelu_bwd_pd_t;
+    using hint_class = prelu_fwd_pd_t;
 
     primitive_desc_t::arg_usage_t arg_usage(int arg) const override {
         if (arg == DNNL_ARG_SRC) return arg_usage_t::input;
@@ -216,7 +219,7 @@ struct prelu_bwd_pd_t : public prelu_pd_t {
     memory_desc_t diff_weights_md_;
     memory_desc_t diff_dst_md_;
 
-    prelu_bwd_pd_t(const prelu_desc_t *adesc, const primitive_attr_t *attr,
+    prelu_bwd_pd_t(const op_desc_t *adesc, const primitive_attr_t *attr,
             const prelu_fwd_pd_t *hint_fwd_pd)
         : prelu_pd_t(adesc, attr, hint_fwd_pd)
         , diff_src_md_(desc_.diff_src_desc)
@@ -242,6 +245,7 @@ struct prelu_bwd_pd_t : public prelu_pd_t {
                 == status::success);
     }
 };
+// NOLINTEND(google-default-arguments)
 
 } // namespace impl
 } // namespace dnnl
diff --git a/src/common/primitive.hpp b/src/common/primitive.hpp
index ba217b521d7..1cc39c86eeb 100644
--- a/src/common/primitive.hpp
+++ b/src/common/primitive.hpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2016-2024 Intel Corporation
+* Copyright 2016-2025 Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -150,25 +150,25 @@ struct nested_scratchpad_t {
 } // namespace impl
 } // namespace dnnl
 
-#define ARG_TYPE(t) \
-    typename std::remove_cv<typename std::remove_pointer<t>::type>::type
+#define ARG_PTR_TYPE(t) \
+    typename std::remove_cv<typename std::remove_pointer<t>::type>::type *
 
 // Returns destination memory which has been zero pad initialized. This macro
 // may result in a failure returned via the `status` input since zero pad
 // may fail.
 #define CTX_OUT_CLEAN_MEM(type, arg, status) \
-    static_cast<ARG_TYPE(type) *>(ctx.host_ptr(arg, true, &status))
+    static_cast<ARG_PTR_TYPE(type)>(ctx.host_ptr(arg, true, &(status)))
 
 // Returns destination memory which may not have been zero pad initialized.
 #define CTX_OUT_MEM_COMMON(type, arg, index) \
-    static_cast<ARG_TYPE(type) *>(ctx.host_ptr(arg, false, nullptr, index))
+    static_cast<ARG_PTR_TYPE(type)>(ctx.host_ptr(arg, false, nullptr, index))
 
 #define CTX_OUT_MEM(type, arg) CTX_OUT_MEM_COMMON(type, arg, 0)
 #define CTX_OUT_MEM0(type, arg) CTX_OUT_MEM_COMMON(type, arg, 0)
 #define CTX_OUT_MEM1(type, arg) CTX_OUT_MEM_COMMON(type, arg, 1)
 #define CTX_OUT_MEM2(type, arg) CTX_OUT_MEM_COMMON(type, arg, 2)
 
 #define CTX_IN_MEM_COMMON(type, arg, index) \
-    static_cast<const ARG_TYPE(type) *>( \
+    static_cast<const ARG_PTR_TYPE(type)>( \
             ctx.host_ptr(arg, false, nullptr, index))
 #define CTX_IN_MEM(type, arg) CTX_IN_MEM_COMMON(type, arg, 0)
 #define CTX_IN_MEM0(type, arg) CTX_IN_MEM_COMMON(type, arg, 0)
diff --git a/src/common/primitive_attr.cpp b/src/common/primitive_attr.cpp
index 09007dd968a..d088682adca 100644
--- a/src/common/primitive_attr.cpp
+++ b/src/common/primitive_attr.cpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2017-2024 Intel Corporation
+* Copyright 2017-2025 Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
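Typical call sites for the renamed macros above (the float type is just an example; the argument kinds come from the public API):

// const auto *src = CTX_IN_MEM(const float *, DNNL_ARG_SRC);
// auto *dst = CTX_OUT_MEM(float *, DNNL_ARG_DST);
// status_t st = status::success;
// auto *clean_dst = CTX_OUT_CLEAN_MEM(float *, DNNL_ARG_DST, st);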
@@ -35,12 +35,7 @@ const primitive_attr_t &default_attr() { return default_attr_instance; } -const runtime_scales_t &default_runtime_scale() { - static const runtime_scales_t default_runtime_scale_instance; - return default_runtime_scale_instance; -} - -void scales_t::set_single_scale(float scale) { +void rnn_create_time_scales_t::set_single_scale(float scale) { count_ = 1; mask_ = 0; scales_ = scales_buf_; @@ -51,7 +46,8 @@ void scales_t::set_single_scale(float scale) { } } -status_t scales_t::set(dim_t count, int mask, const float *scales) { +status_t rnn_create_time_scales_t::set( + dim_t count, int mask, const float *scales) { cleanup(); count_ = count; @@ -73,39 +69,25 @@ status_t scales_t::set(dim_t count, int mask, const float *scales) { return status::success; } -status_t zero_points_t::get(int arg, int *mask, data_type_t *dt) const { - if (mask) *mask = get_mask(arg); - if (dt) *dt = get_data_type(arg); - return status::success; -} +template <typename T> +status_t shifts_t<T>::set(int count, int mask, const T *shifts) { + cleanup(); -int zero_points_t::get(int arg) const { - return get_mask(arg); -} - -status_t zero_points_t::set(int arg, int mask, int ndims, const dims_t groups, - data_type_t data_type) { - const bool supported_arg - = utils::one_of(arg, DNNL_ARG_SRC, DNNL_ARG_WEIGHTS, DNNL_ARG_DST); - if (!supported_arg) return status::unimplemented; - - switch (arg) { - case DNNL_ARG_SRC: - is_set_src = true; - mask_src = mask; - break; - case DNNL_ARG_WEIGHTS: - is_set_wei = true; - mask_wei = mask; - data_type_wei = data_type; - group_ndims_wei = ndims; - utils::array_copy(group_dims_wei, groups, group_ndims_wei); - break; - case DNNL_ARG_DST: - is_set_dst = true; - mask_dst = mask; - break; + count_ = count; + mask_ = mask; + + if (count_ == 1) { + shifts_ = shifts_buf_; + utils::array_set(shifts_, shifts[0], shifts_buf_size); + } else { + shifts_ = (T *)impl::malloc(count_ * sizeof(*shifts_), 64); + if (shifts_ == nullptr) + return status::out_of_memory; + + for (int c = 0; c < count_; ++c) + shifts_[c] = shifts[c]; } + return status::success; } @@ -128,37 +110,31 @@ status_t dropout_t::set_default_formats(const memory_desc_t *dst_md) { bool primitive_attr_t::has_default_values(dnnl_primitive_attr::skip_mask_t mask, dnnl::impl::data_type_t dst_dt) const { using smask_t = skip_mask_t; - // prepare mask for runtime-parameters check - smask_t defined_mask = smask_t::none; - if ((mask & smask_t::oscale_runtime) == smask_t::oscale_runtime) - defined_mask |= smask_t::oscale; - if ((mask & smask_t::scales_runtime) == smask_t::scales_runtime) - defined_mask |= smask_t::scales; - if ((mask & smask_t::zero_points_runtime) == smask_t::zero_points_runtime) - defined_mask |= smask_t::zero_points; bool ok = true; #define CHECK_ARG(x) ok = ok && (x) #define CHECK_MASK(mask_name, mask_field) \ CHECK_ARG(IMPLICATION( \ (bool)(~mask & (mask_name)), (mask_field).has_default_values())) - CHECK_MASK(smask_t::oscale_runtime, output_scales_); CHECK_MASK(smask_t::scales, scales_); - CHECK_ARG(IMPLICATION((bool)(~mask & smask_t::scales_runtime_groups), + CHECK_ARG(IMPLICATION((bool)(~mask & smask_t::scales_groups), scales_.has_default_groups())); - CHECK_ARG(IMPLICATION((bool)(~mask & smask_t::scales_runtime_data_type), + CHECK_ARG(IMPLICATION((bool)(~mask & smask_t::scales_data_type), scales_.has_default_data_type())); CHECK_MASK(smask_t::zero_points, zero_points_); - CHECK_ARG(IMPLICATION((bool)(~mask & smask_t::zero_points_runtime_groups), + CHECK_ARG(IMPLICATION((bool)(~mask & smask_t::zero_points_groups),
zero_points_.has_default_groups())); - CHECK_ARG( - IMPLICATION((bool)(~mask & smask_t::zero_points_runtime_data_type), - zero_points_.has_default_data_type())); + CHECK_ARG(IMPLICATION((bool)(~mask & smask_t::zero_points_data_type), + zero_points_.has_default_data_type())); + CHECK_MASK(smask_t::input_zero_points, input_zero_points_); + CHECK_MASK(smask_t::weights_zero_points, weights_zero_points_); + CHECK_MASK(smask_t::output_compensations, output_compensations_); CHECK_MASK(smask_t::post_ops, post_ops_); CHECK_MASK(smask_t::rnn_data_qparams, rnn_data_qparams_); CHECK_MASK(smask_t::rnn_weights_qparams, rnn_weights_qparams_); CHECK_MASK(smask_t::rnn_weights_projection_qparams, rnn_weights_projection_qparams_); + CHECK_MASK(smask_t::src_dyn_quant_params, src_dyn_quant_params_); CHECK_ARG(IMPLICATION((bool)(~mask & smask_t::sum_dt), post_ops_.sum_with_default_dt(dst_dt))); bool gpu_attr_ok = IMPLICATION((bool)(~mask & smask_t::gpu_attr), @@ -172,7 +148,7 @@ bool primitive_attr_t::has_default_values(dnnl_primitive_attr::skip_mask_t mask, (bool)(~mask & smask_t::dropout), dropout_.has_default_values())); CHECK_ARG(IMPLICATION((bool)(~mask & smask_t::rounding_mode), rounding_mode_.has_default_values())); - CHECK_ARG(this->defined(defined_mask)); + CHECK_ARG(this->defined(smask_t::none)); bool fpmath_mode_ok = IMPLICATION( (bool)(~mask & smask_t::fpmath_mode) && fpmath_.apply_to_int_, fpmath_.mode_ == fpmath_mode::strict); @@ -188,14 +164,11 @@ bool primitive_attr_t::defined(dnnl_primitive_attr::skip_mask_t mask) const { #define CHECK_ARG(x) ok = ok && (x) #define CHECK_MASK(mask_name, mask_field) \ CHECK_ARG(IMPLICATION((bool)(~mask & (mask_name)), (mask_field).defined())) - CHECK_MASK(smask_t::oscale, output_scales_); - CHECK_MASK(smask_t::scales, scales_); - CHECK_MASK(smask_t::zero_points, zero_points_); - CHECK_MASK(smask_t::post_ops, post_ops_); CHECK_MASK(smask_t::rnn_data_qparams, rnn_data_qparams_); CHECK_MASK(smask_t::rnn_weights_qparams, rnn_weights_qparams_); CHECK_MASK(smask_t::rnn_weights_projection_qparams, rnn_weights_projection_qparams_); + CHECK_MASK(smask_t::src_dyn_quant_params, src_dyn_quant_params_); return ok; #undef CHECK_MASK #undef CHECK_ARG @@ -203,6 +176,8 @@ bool primitive_attr_t::defined(dnnl_primitive_attr::skip_mask_t mask) const { status_t post_ops_t::append_sum( float scale, int32_t zero_point, data_type_t dt) { + if (is_runtime_value(scale)) return invalid_arguments; + entry_.emplace_back(); auto &e = entry_.back(); e.kind = primitive_kind::sum; @@ -216,6 +191,9 @@ status_t post_ops_t::append_eltwise( float scale, alg_kind_t alg, float alpha, float beta) { if (!math::is_eltwise_ok(data_type::f32, alg, alpha, beta)) return invalid_arguments; + if (is_runtime_value(scale)) return invalid_arguments; + if (is_runtime_value(alpha)) return invalid_arguments; + if (is_runtime_value(beta)) return invalid_arguments; entry_.emplace_back(); auto &e = entry_.back(); @@ -262,7 +240,7 @@ status_t post_ops_t::validate_binary( using namespace alg_kind; bool alg_ok = one_of(alg, binary_add, binary_mul, binary_max, binary_min, binary_div, binary_sub, binary_ge, binary_gt, binary_le, binary_lt, - binary_eq, binary_ne); + binary_eq, binary_ne, binary_prelu); if (!alg_ok) return invalid_arguments; if (!memory_desc_sanity_check(*user_src1_desc)) return invalid_arguments; @@ -313,25 +291,77 @@ status_t post_ops_t::append_prelu(int mask) { return success; } -bool post_ops_t::defined() const { - for (int idx = 0; idx < len(); ++idx) { - auto kind = entry_[idx].kind; - if (kind == 
primitive_kind::sum) { - if (is_runtime_value(entry_[idx].sum.scale)) return false; - } else if (kind == primitive_kind::eltwise) { - const auto &e = entry_[idx].eltwise; - if (is_runtime_value(e.scale) || is_runtime_value(e.alpha) - || is_runtime_value(e.beta)) - return false; - } else if (utils::one_of(kind, primitive_kind::binary, - primitive_kind::prelu, - primitive_kind::convolution)) { - // binary is always defined - } else { - assert(!"unreachable"); - } - } - return true; +status_t post_ops_t::append_depthwise(alg_kind_t alg, size_t offset_size, const size_t* offset) { + using namespace dnnl::impl::alg_kind; + if (len() == post_ops_limit) return out_of_memory; + bool known_alg = one_of(alg, depthwise_scale_shift, depthwise_prelu); + if (!known_alg) + return invalid_arguments; + + entry_.emplace_back(); + auto &e = entry_.back(); + e.kind = primitive_kind::depthwise; + e.depthwise.alg = alg; + array_copy(e.depthwise.offset, offset, offset_size); + + return success; +} + +status_t post_ops_t::append_quantization(alg_kind_t alg, + size_t per_channel_size, const bool* per_channel, + size_t all_default_size, const bool* all_default, + size_t offset_size, const size_t* offset) { + using namespace dnnl::impl::alg_kind; + if (len() == post_ops_limit) return out_of_memory; + bool known_alg = one_of(alg, quantization_quantize_dequantize, quantization_quantize); + if (!known_alg) + return invalid_arguments; + + entry_.emplace_back(); + auto &e = entry_.back(); + e.kind = primitive_kind::quantization; + e.quantization.alg = alg; + + array_copy(e.quantization.per_channel, per_channel, per_channel_size); + array_copy(e.quantization.all_default, all_default, all_default_size); + array_copy(e.quantization.offset, offset, offset_size); + + return success; +} + +status_t post_ops_t::append_binarization(alg_kind_t alg, const float* weights_data, const float* output_mask_data) { + using namespace dnnl::impl::alg_kind; + if (len() == post_ops_limit) return out_of_memory; + bool known_alg = one_of(alg, binarization_depthwise); + if (!known_alg) + return invalid_arguments; + + entry_.emplace_back(); + auto &e = entry_.back(); + e.kind = primitive_kind::binarization; + e.binarization.alg = alg; + e.binarization.weights_data = weights_data; + e.binarization.output_mask_data = output_mask_data; + + return success; +} + +status_t post_ops_t::append_dw_conv(int in_h, int in_w, int ker_h, int ker_w, int str_h, int str_w, + dnnl::impl::data_type_t in_dt) { + if (len() == post_ops_limit) return out_of_memory; + + entry_.emplace_back(); + auto &e = entry_.back(); + e.kind = primitive_kind::convolution; + e.depthwise_conv_old.in_h = in_h; + e.depthwise_conv_old.in_w = in_w; + e.depthwise_conv_old.ker_h = ker_h; + e.depthwise_conv_old.ker_w = ker_w; + e.depthwise_conv_old.str_h = str_h; + e.depthwise_conv_old.str_w = str_w; + e.depthwise_conv_old.in_dt = in_dt; + + return success; } status_t post_ops_t::set_default_formats(const memory_desc_t *dst_md) { @@ -398,6 +428,26 @@ bool post_ops_t::check_sum_consistency(const data_type_t dst_dt, && check_sum_consistent_quantization(dst_dt, is_int8); } +status_t post_ops_t::entry_t::validate_binary_with_dst_consistency( + const memory_desc_t *dst_md) const { + if (!is_binary()) return status::success; + + VCHECK_ATTR(dst_md->ndims == binary.user_src1_desc.ndims, + VERBOSE_INCONSISTENT_NDIMS_WITH_VALS, "dst", "bin_po", + dst_md->ndims, binary.user_src1_desc.ndims); + + return status::success; +} + +status_t post_ops_t::validate_binary_with_dst_consistency( + const 
memory_desc_t *dst_md) const { + for (const auto &e : entry_) { + CHECK(e.validate_binary_with_dst_consistency(dst_md)); + } + + return status::success; +} + status_t primitive_attr_t::set_dropout(const memory_desc_t *user_dropout_desc) { if (any_null(user_dropout_desc)) return invalid_arguments; dropout_.user_dropout_desc_ = *user_dropout_desc; @@ -429,8 +479,10 @@ status_t primitive_attr_t::set_accumulation_mode(accumulation_mode_t am) { status_t primitive_attr_t::set_scratchpad_mode( scratchpad_mode_t scratchpad_mode) { - const bool ok = one_of( - scratchpad_mode, scratchpad_mode::library, scratchpad_mode::user); + /* workaround for the name conflict with system struct 'user' in llvm-android toolchain */ + using namespace dnnl::impl::scratchpad_mode; + + const bool ok = one_of(scratchpad_mode, scratchpad_mode::library, scratchpad_mode::user); if (!ok) return invalid_arguments; scratchpad_mode_ = scratchpad_mode; @@ -562,10 +614,18 @@ status_t dnnl_primitive_attr_set_scratchpad_mode( status_t dnnl_primitive_attr_set_scales_mask( primitive_attr_t *attr, int arg, int mask) { - bool ok = attr && mask >= 0 && arg >= 0; - if (!ok) return invalid_arguments; + VCHECK_ATTR(attr, VERBOSE_NULL_ARG); + VCHECK_ATTR(mask >= 0, VERBOSE_BAD_PARAM, "mask"); + VCHECK_ATTR(arg >= 0, VERBOSE_BAD_PARAM, "arg"); return attr->scales_.set(arg, mask); } +status_t dnnl_primitive_attr_set_scales_dims( + primitive_attr_t *attr, int arg, const dims_t dims, int ndims, data_type_t data_type) { + bool ok = attr && arg >= 0 && ndims > 0 + && attr->scales_.has_default_values(); + if (!ok) return invalid_arguments; + return attr->scales_.set_scales(arg, dims, ndims, data_type); +} status_t dnnl_primitive_attr_set_scales(primitive_attr_t *attr, int arg, int mask, int ndims, const dims_t group_dims, data_type_t data_type) { @@ -574,39 +634,47 @@ status_t dnnl_primitive_attr_set_scales(primitive_attr_t *attr, int arg, VCHECK_ATTR(mask >= 0, VERBOSE_BAD_PARAM, "mask"); VCHECK_ATTR(arg >= 0, VERBOSE_BAD_PARAM, "arg"); VCHECK_ATTR(ndims >= 0, VERBOSE_BAD_PARAM, "ndims"); - VCHECK_ATTR(utils::one_of(data_type, f32, bf16, f16, e8m0), - VERBOSE_INVALID_DATATYPE, "scales"); - VCHECK_ATTR(IMPLICATION(!utils::one_of(arg, DNNL_ARG_SRC, DNNL_ARG_WEIGHTS), - data_type == f32 && ndims == 0) - || IMPLICATION(arg == DNNL_ARG_DST, - utils::one_of(data_type, f32, e8m0)), + VCHECK_ATTR( + utils::one_of(data_type, f32, bf16, f16, e8m0, f8_e5m2, f8_e4m3), VERBOSE_INVALID_DATATYPE, "scales"); VCHECK_ATTR(IMPLICATION(ndims, validate_dims(ndims, group_dims)), VERBOSE_BAD_PARAM, "group_dims"); - return attr->scales_.set(arg, mask, ndims, group_dims, data_type); + return attr->scales_.set(arg, mask, data_type, ndims, group_dims); } status_t dnnl_primitive_attr_set_zero_points_mask( primitive_attr_t *attr, int arg, int mask) { - bool ok = attr && mask >= 0; + VCHECK_ATTR(attr, VERBOSE_NULL_ARG); + VCHECK_ATTR(mask >= 0, VERBOSE_BAD_PARAM, "mask"); + return attr->zero_points_.set(arg, mask); +} +status_t dnnl_primitive_attr_set_zero_points_dims( + primitive_attr_t *attr, int arg, const dims_t dims, int ndims, dnnl_data_type_t data_type) { + bool ok = attr && ndims > 0; if (!ok) return invalid_arguments; - return attr->zero_points_.set(arg, mask); + return attr->zero_points_.set_zero_points(arg, dims, ndims, data_type); } -dnnl_status_t DNNL_API dnnl_primitive_attr_set_zero_points( - dnnl_primitive_attr_t attr, int arg, int mask, int ndims, - const dnnl_dims_t group_dims, dnnl_data_type_t data_type) { +status_t 
dnnl_primitive_attr_set_zero_points(dnnl_primitive_attr_t attr, + int arg, int mask, int ndims, const dnnl_dims_t group_dims, + dnnl_data_type_t data_type) { using namespace data_type; - bool ok = attr && arg >= 0 && mask >= 0 && ndims >= 0 - && utils::one_of(data_type, s32, s8, u8, s4, u4) - && IMPLICATION( - arg != DNNL_ARG_WEIGHTS, data_type == s32 && ndims == 0) - && IMPLICATION(utils::one_of(data_type, s4, u4), mask > 0) - && IMPLICATION(ndims, validate_dims(ndims, group_dims)); - if (!ok) return invalid_arguments; + VCHECK_ATTR(attr, VERBOSE_NULL_ARG); + VCHECK_ATTR(mask >= 0, VERBOSE_BAD_PARAM, "mask"); + VCHECK_ATTR(arg >= 0, VERBOSE_BAD_PARAM, "arg"); + VCHECK_ATTR(ndims >= 0, VERBOSE_BAD_PARAM, "ndims"); + VCHECK_ATTR(utils::one_of(data_type, s32, s8, u8, s4, u4), + VERBOSE_INVALID_DATATYPE, "zero points"); + VCHECK_ATTR(IMPLICATION(utils::one_of(data_type, s4, u4), mask > 0), + VERBOSE_BAD_PARAM, "mask with int4 data type"); + VCHECK_ATTR(IMPLICATION(!utils::one_of(arg, DNNL_ARG_SRC, DNNL_ARG_WEIGHTS), + data_type == s32 && ndims == 0), + VERBOSE_INVALID_DATATYPE, "zero points"); + VCHECK_ATTR(IMPLICATION(ndims, validate_dims(ndims, group_dims)), + VERBOSE_BAD_PARAM, "group_dims"); - return attr->zero_points_.set(arg, mask, ndims, group_dims, data_type); + return attr->zero_points_.set(arg, mask, data_type, ndims, group_dims); } status_t dnnl_primitive_attr_get_rounding( @@ -622,6 +690,33 @@ status_t dnnl_primitive_attr_set_rounding( return attr->rounding_mode_.set(arg, mode); } +status_t dnnl_primitive_attr_set_output_compensations(primitive_attr_t *attr, + int count, int mask) { + bool ok = !any_null(attr) && count > 0 && mask >= 0; + if (!ok) + return invalid_arguments; + + return attr->output_compensations_.set(count, mask); +} + +status_t dnnl_primitive_attr_set_input_zero_points(primitive_attr_t *attr, + int count, int mask) { + bool ok = !any_null(attr) && count > 0 && mask >= 0; + if (!ok) + return invalid_arguments; + + return attr->input_zero_points_.set(count, mask); +} + +status_t dnnl_primitive_attr_set_weights_zero_points(primitive_attr_t *attr, + int count, int mask) { + bool ok = !any_null(attr) && count > 0 && mask >= 0; + if (!ok) + return invalid_arguments; + + return attr->weights_zero_points_.set(count, mask); +} + status_t dnnl_primitive_attr_get_post_ops( const primitive_attr_t *attr, const post_ops_t **post_ops) { if (any_null(attr, post_ops)) return invalid_arguments; @@ -681,19 +776,20 @@ status_t dnnl_post_ops_append_sum( } namespace { -bool simple_get_params_check( +status_t simple_get_params_check( const post_ops_t *post_ops, int index, primitive_kind_t kind) { - bool ok = true && post_ops != nullptr && 0 <= index - && index < post_ops->len() && post_ops->entry_[index].kind == kind; - return ok; + VCHECK_ATTR(post_ops, VERBOSE_NULL_ARG); + VCHECK_ATTR(index >= 0, VERBOSE_BAD_PARAM, "index"); + VCHECK_ATTR(index < post_ops->len(), VERBOSE_BAD_PARAM, "index"); + VCHECK_ATTR( + post_ops->entry_[index].kind == kind, VERBOSE_BAD_PARAM, "kind"); + return status::success; } } // namespace status_t dnnl_post_ops_get_params_sum(const post_ops_t *post_ops, int index, float *scale, int32_t *zero_point, data_type_t *dt) { - bool ok = true - && simple_get_params_check(post_ops, index, primitive_kind::sum); - if (!ok) return invalid_arguments; + CHECK(simple_get_params_check(post_ops, index, primitive_kind::sum)); if (scale) *scale = post_ops->entry_[index].sum.scale; if (zero_point) *zero_point = post_ops->entry_[index].sum.zero_point; @@ -711,15 +807,12 @@ 
status_t dnnl_post_ops_append_eltwise( status_t dnnl_post_ops_get_params_eltwise(const post_ops_t *post_ops, int index, alg_kind_t *alg, float *alpha, float *beta) { - bool ok = true - && simple_get_params_check(post_ops, index, primitive_kind::eltwise) - && !any_null(alpha, beta); - if (!ok) return invalid_arguments; + CHECK(simple_get_params_check(post_ops, index, primitive_kind::eltwise)); const auto &e = post_ops->entry_[index].eltwise; - *alg = e.alg; - *alpha = e.alpha; - *beta = e.beta; + if (alg) *alg = e.alg; + if (alpha) *alpha = e.alpha; + if (beta) *beta = e.beta; return success; } @@ -736,9 +829,8 @@ status_t dnnl_post_ops_append_dw(post_ops_t *post_ops, data_type_t wei_dt, status_t dnnl_post_ops_get_params_dw(const post_ops_t *post_ops, int index, data_type_t *wei_dt, data_type_t *bias_dt, data_type_t *dst_dt, dim_t *kernel, dim_t *stride, dim_t *padding) { - - if (!simple_get_params_check(post_ops, index, primitive_kind::convolution)) - return invalid_arguments; + CHECK(simple_get_params_check( + post_ops, index, primitive_kind::convolution)); const auto &d = post_ops->entry_[index].depthwise_conv; if (wei_dt) *wei_dt = d.wei_dt; @@ -760,8 +852,7 @@ status_t dnnl_post_ops_append_binary(post_ops_t *post_ops, alg_kind_t alg_kind, status_t dnnl_post_ops_get_params_binary(const post_ops_t *post_ops, int index, alg_kind_t *alg_kind, const memory_desc_t **user_src1_desc) { - if (!simple_get_params_check(post_ops, index, primitive_kind::binary)) - return invalid_arguments; + CHECK(simple_get_params_check(post_ops, index, primitive_kind::binary)); const auto &b = post_ops->entry_[index].binary; if (alg_kind) *alg_kind = b.alg; @@ -787,6 +878,45 @@ status_t dnnl_post_ops_get_params_prelu( return success; } +status_t dnnl_post_ops_append_depthwise(dnnl_post_ops_t post_ops, dnnl_alg_kind_t alg, size_t offset_size, const size_t* offset) { + if (post_ops == nullptr || offset == nullptr) return invalid_arguments; + + if (offset_size != 2) + return invalid_arguments; + + return post_ops->append_depthwise(alg, offset_size, offset); +} + +status_t dnnl_post_ops_append_quantization(post_ops_t *post_ops, alg_kind_t kind, + size_t per_channel_size, const bool* per_channel, + size_t all_default_size, const bool* all_default, + size_t offset_size, const size_t* offset) { + if (post_ops == nullptr || per_channel == nullptr || all_default == nullptr || offset == nullptr) + return invalid_arguments; + + if (per_channel_size != all_default_size || all_default_size != offset_size || offset_size != 6) + return invalid_arguments; + + return post_ops->append_quantization(kind, per_channel_size, per_channel, all_default_size, all_default, offset_size, offset); +} + +status_t dnnl_post_ops_append_binarization(post_ops_t *post_ops, alg_kind_t kind, const float* weights_data, + const float* output_mask_data) { + if (post_ops == nullptr) + return invalid_arguments; + + return post_ops->append_binarization(kind, weights_data, output_mask_data); +} + +status_t dnnl_post_ops_append_dw_conv(post_ops_t *post_ops, + int in_h, int in_w, int ker_h, int ker_w, int str_h, int str_w, + dnnl::impl::data_type_t in_dt) { + if (post_ops == nullptr) + return invalid_arguments; + + return post_ops->append_dw_conv(in_h, in_w, ker_h, ker_w, str_h, str_w, in_dt); +} + status_t dnnl_primitive_attr_set_rnn_data_qparams( primitive_attr_t *attr, const float scale, const float shift) { if (attr == nullptr) return invalid_arguments; @@ -854,3 +984,22 @@ status_t DNNL_API dnnl_primitive_attr_set_rnn_tparams( return 
attr->rnn_tparams_.set(mode, ngates, scales, cscale); } + +status_t dnnl_primitive_attr_set_src_dyn_quant_params( + primitive_attr_t *attr, const uint64_t group_size) { + if (attr == nullptr) return invalid_arguments; + + return attr->src_dyn_quant_params_.set(group_size); +} + +status_t dnnl_primitive_attr_get_src_dyn_quant_params( + primitive_attr_t *attr, uint64_t* group_size) { + if (attr == nullptr) return invalid_arguments; + + if (group_size) *group_size = attr->src_dyn_quant_params_.get(); + return success; +} + +template struct dnnl::impl::shifts_t<float>; +template struct dnnl::impl::shifts_t<int32_t>; +template struct dnnl::impl::shifts_t<uint8_t>; diff --git a/src/common/primitive_attr.hpp b/src/common/primitive_attr.hpp index 5e1496978ed..c961d884455 100644 --- a/src/common/primitive_attr.hpp +++ b/src/common/primitive_attr.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2017-2024 Intel Corporation +* Copyright 2017-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -24,6 +24,7 @@ #include "c_types_map.hpp" #include "nstl.hpp" +#include "primitive_attr_quant.hpp" #include "type_helpers.hpp" #include "utils.hpp" @@ -36,11 +37,9 @@ namespace dnnl { namespace impl { const primitive_attr_t &default_attr(); -struct runtime_scales_t; -const runtime_scales_t &default_runtime_scale(); struct rnn_data_qparams_t : public c_compatible { - rnn_data_qparams_t() : scale_(1.), shift_(0.) {} + rnn_data_qparams_t() : scale_(1.f), shift_(0.f) {} bool has_default_values() const { return (scale_ == 1. && shift_ == 0.); } bool defined() const { return !is_runtime_value(scale_) && !is_runtime_value(shift_); @@ -129,14 +128,14 @@ struct rnn_tparams_t : public c_compatible { }; // Note: keep for RNN quantization -struct scales_t : public c_compatible { - scales_t() : count_(1), mask_(0), scales_(scales_buf_) { - set_single_scale(1.); +struct rnn_create_time_scales_t : public c_compatible { + rnn_create_time_scales_t() : count_(1), mask_(0), scales_(scales_buf_) { + set_single_scale(1.f); } - ~scales_t() { cleanup(); } + ~rnn_create_time_scales_t() { cleanup(); } - bool operator==(const scales_t &rhs) const { + bool operator==(const rnn_create_time_scales_t &rhs) const { bool ret = count_ == rhs.count_ && mask_ == rhs.mask_ && !utils::any_null(scales_, rhs.scales_) && defined() == rhs.defined() @@ -162,7 +161,7 @@ struct scales_t : public c_compatible { return status::success; } - status_t copy_from(const scales_t &other) { + status_t copy_from(const rnn_create_time_scales_t &other) { return set(other.count_, other.mask_, other.scales_); } @@ -182,296 +181,59 @@ struct scales_t : public c_compatible { scales_ = scales_buf_; } - DNNL_DISALLOW_COPY_AND_ASSIGN(scales_t); + DNNL_DISALLOW_COPY_AND_ASSIGN(rnn_create_time_scales_t); }; -struct runtime_scales_t : public c_compatible { - // Clang-3.8.1 raises an error for a default initialization of a const - // object. Const runtime_scales_t object is used as default_scales.
- // runtime_scales_t() = default; - runtime_scales_t() {} +template <typename T> +struct shifts_t: public c_compatible { + shifts_t(): count_(1), mask_(0), shifts_(shifts_buf_) + { set(0); } - runtime_scales_t &operator=(const runtime_scales_t &rhs) { - mask_ = rhs.mask_; - is_set_ = rhs.is_set_; - ndims_ = rhs.ndims_; - if (ndims_ > 0) utils::array_copy(group_dims_, rhs.group_dims_, ndims_); - data_type_ = rhs.data_type_; - return *this; - } - - status_t set(int mask) { return set(0, mask, {}, data_type::f32); } - - status_t set(int ndims, int mask, const dims_t group_dims, - data_type_t data_type = data_type::f32) { - mask_ = mask; - is_set_ = true; - ndims_ = ndims; - if (ndims > 0) utils::array_copy(group_dims_, group_dims, ndims); - data_type_ = data_type; - return status::success; - } - - bool operator==(const runtime_scales_t &rhs) const { - return mask_ == rhs.mask_ && is_set_ == rhs.is_set_ - && ndims_ == rhs.ndims_ - && IMPLICATION(ndims_ > 0, - utils::array_cmp(group_dims_, rhs.group_dims_, ndims_)) - && data_type_ == rhs.data_type_; - } - - bool has_default_values() const { return *this == default_runtime_scale(); } - - bool has_default_groups() const { return 0 == ndims_; } - bool has_default_data_type() const { return data_type_ == data_type::f32; } - - bool defined() const { return has_default_values(); } - - void reset() { *this = default_runtime_scale(); } - - // TODO: replace with `-1` to remove `is_set_`. - // Hide `mask_` under `private:` to force interface usage. - int mask_ = 0; - bool is_set_ = false; - int ndims_ = 0; - dims_t group_dims_ = {}; - data_type_t data_type_ = data_type::f32; -}; - -struct arg_scales_t : public c_compatible { - arg_scales_t() = default; - - const runtime_scales_t &get(int arg) const { - static const runtime_scales_t default_scales; - const auto it = scales_.find(arg); - if (it == scales_.end()) return default_scales; - return it->second; - } - - status_t set(int arg, const runtime_scales_t &scale) { - if (!check_arg(arg)) return status::invalid_arguments; - scales_[arg] = scale; - return status::success; - } - - bool operator==(const arg_scales_t &rhs) const { - return scales_ == rhs.scales_; - } - - bool has_default_values(const std::vector<int> &skip_args = {}) const { - auto predicate = [](const runtime_scales_t &s) { - return s.has_default_values(); - }; - return has_default_property(skip_args, predicate); - } - - bool has_default_data_type(const std::vector<int> &skip_args = {}) const { - auto predicate = [](const runtime_scales_t &s) { - return s.has_default_data_type(); - }; - return has_default_property(skip_args, predicate); - } - - bool has_default_groups(const std::vector<int> &skip_args = {}) const { - auto predicate = [](const runtime_scales_t &s) { - return s.has_default_groups(); - }; - return has_default_property(skip_args, predicate); - } - - status_t set(int arg, int mask) { - return set(arg, mask, 0, {}, data_type::f32); - } - - status_t set(int arg, int mask, int ndims, const dims_t group_dims, - data_type_t data_type) { - if (!check_arg(arg)) return status::invalid_arguments; - return scales_[arg].set(ndims, mask, group_dims, data_type); - } + ~shifts_t() { cleanup(); } - // TODO: move to `private` and keep a single interface per entry.
- status_t get(int arg, int *mask, bool *is_set, int *ndims = nullptr, - dims_t group_dims = nullptr, - data_type_t *data_type = nullptr) const { - if (!check_arg(arg)) return status::invalid_arguments; - const auto &s = get(arg); - if (mask) *mask = s.mask_; - if (is_set) *is_set = s.is_set_; - if (ndims) *ndims = s.ndims_; - if (group_dims && s.ndims_ > 0) - utils::array_copy(group_dims, s.group_dims_, s.ndims_); - if (data_type) *data_type = s.data_type_; - return status::success; - } - - data_type_t get_data_type(int arg) const { - data_type_t data_type; - auto st = get(arg, nullptr, nullptr, nullptr, nullptr, &data_type); - if (st != status::success) return data_type::undef; - return data_type; - } - - status_t reset(int arg) { - if (!check_arg(arg)) return status::invalid_arguments; - const auto it = scales_.find(arg); - if (it != scales_.end()) scales_.erase(it); - return status::success; - } - - bool defined() const { return has_default_values(); } - - status_t copy_from(const arg_scales_t &other) { - for (auto it = other.scales_.begin(); it != other.scales_.end(); ++it) { - // Find an entry that can match the arguments without constructing a - // new object. - if (scales_.count(it->first) == 1) { - auto &entry = scales_[it->first]; - if (entry == it->second) continue; - } - - CHECK(set(it->first, it->second)); - } - return status::success; + bool operator==(const shifts_t &rhs) const { + bool ret = count_ == rhs.count_ && mask_ == rhs.mask_ + && !utils::any_null(shifts_, rhs.shifts_) + && defined() == rhs.defined() + && IMPLICATION(defined(), + utils::array_cmp(shifts_, rhs.shifts_, count_)); + return ret; } - std::map<int, runtime_scales_t> scales_; - -private: - bool check_arg(int arg) const { - // binary - for (const auto &sa : {DNNL_ARG_SRC_0, DNNL_ARG_SRC_1}) { - if (arg == sa) return true; - } - // concat - if (arg & DNNL_ARG_MULTIPLE_SRC) return true; - // convolution - for (const auto &sa : {DNNL_ARG_SRC, DNNL_ARG_WEIGHTS, DNNL_ARG_DST}) { - if (arg == sa) return true; - } - // depth-wise convolution post op - for (const auto &sa : {DNNL_ARG_SRC, DNNL_ARG_WEIGHTS, DNNL_ARG_DST}) { - if (arg == (DNNL_ARG_ATTR_POST_OP_DW | sa)) return true; - } - return false; - } - - bool has_default_property(const std::vector<int> &skip_args, - bool (*predicate)(const runtime_scales_t &)) const { - for (const auto &s : scales_) { - if (!predicate(s.second)) { - bool skip = false; - for (const auto &skip_a : skip_args) - if (s.first == skip_a) { - skip = true; - break; - } - if (skip) continue; - return false; - } + bool has_default_values() const { + for (int c = 0; c < count_; ++c) { + if(shifts_[c] != 0) return false; } return true; } -}; - -struct zero_points_t : public c_compatible { - bool operator==(const zero_points_t &rhs) const { - return mask_src == rhs.mask_src && mask_wei == rhs.mask_wei - && mask_dst == rhs.mask_dst && is_set_src == rhs.is_set_src - && is_set_wei == rhs.is_set_wei && is_set_dst == rhs.is_set_dst - && data_type_wei == rhs.data_type_wei - && group_ndims_wei == rhs.group_ndims_wei - && IMPLICATION(group_ndims_wei > 0, - utils::array_cmp(group_dims_wei, rhs.group_dims_wei, - group_ndims_wei)); - } - - // arg-specific checks - bool common(int arg) const { return get_mask(arg) == 0; } - bool defined(int arg) const { return has_default_values(arg); } - bool has_default_values(int arg) const { - return is_set(arg) == false && has_default_data_type(arg); - } - bool has_default_groups(int arg) const { - return IMPLICATION(arg == DNNL_ARG_WEIGHTS, group_ndims_wei == 0); - } - bool
has_default_data_type(int arg) const { - return get_data_type(arg) == data_type::s32; - } - // same checks but for all supported arguments at once - bool common() const { return check_all(&zero_points_t::common); } - bool defined() const { return has_default_values(); } - bool has_default_values() const { - return check_all(&zero_points_t::has_default_values); - } - bool has_default_groups() const { - return check_all(&zero_points_t::has_default_groups); - } - bool has_default_data_type() const { - return check_all(&zero_points_t::has_default_data_type); - } - - status_t get(int arg, int *mask, data_type_t *dt = nullptr) const; - - int get(int arg) const; // Returns 0 if dimension is unset - - data_type_t get_data_type(int arg) const { - if (arg == DNNL_ARG_WEIGHTS) return data_type_wei; - return data_type::s32; - } - const dim_t *get_groups(int arg) const { - if (arg == DNNL_ARG_WEIGHTS) return group_dims_wei; - return nullptr; - } + bool defined() const { return !is_runtime_value(shifts_[0]); } - int get_groups_ndims(int arg) const { - if (arg == DNNL_ARG_WEIGHTS) return group_ndims_wei; - return 0; - } + status_t set(int count, int mask, const T *zero_points); + status_t set(T single_zero_point) { return this->set(1, 0, &single_zero_point); } - status_t set(int arg, int mask, int ndims, const dims_t group_dims, - data_type_t data_type); - - status_t set(int arg, int mask) { - return set(arg, mask, 0, nullptr, data_type::s32); + status_t copy_from(const shifts_t &other) { + return set(other.count_, other.mask_, other.shifts_); } - status_t set(int arg) { return set(arg, 0); } + dim_t count_; + int mask_; + T *shifts_; private: - bool is_set_src = false, is_set_wei = false, is_set_dst = false; - int mask_src = 0, mask_wei = 0, mask_dst = 0; - data_type_t data_type_wei = data_type::s32; - int group_ndims_wei = 0; - dims_t group_dims_wei {}; - - int get_mask(int arg) const { - int mask = 0; - switch (arg) { - case DNNL_ARG_SRC: mask = mask_src; break; - case DNNL_ARG_WEIGHTS: mask = mask_wei; break; - case DNNL_ARG_DST: mask = mask_dst; break; - default: mask = 0; - } - return mask; - } + enum { shifts_buf_size = 16 }; + T shifts_buf_[shifts_buf_size]; - bool is_set(int arg) const { - bool arg_is_set = false; - switch (arg) { - case DNNL_ARG_SRC: arg_is_set = is_set_src; break; - case DNNL_ARG_WEIGHTS: arg_is_set = is_set_wei; break; - case DNNL_ARG_DST: arg_is_set = is_set_dst; break; - default: arg_is_set = 0; - } - return arg_is_set; - } + void cleanup() { + if (shifts_ != shifts_buf_ && shifts_ != nullptr) + impl::free(shifts_); - bool check_all(bool (zero_points_t::*f)(int) const) const { - for (int arg : {DNNL_ARG_SRC, DNNL_ARG_WEIGHTS, DNNL_ARG_DST}) - if (!(this->*f)(arg)) return false; - return true; + count_ = 1; + mask_ = 0; + shifts_ = shifts_buf_; } + + DNNL_DISALLOW_COPY_AND_ASSIGN(shifts_t); }; struct dropout_t : public c_compatible { @@ -558,6 +320,26 @@ struct fpmath_t : public c_compatible { bool apply_to_int_; }; +struct legacy_zero_points_t : public c_compatible { + bool operator==(const legacy_zero_points_t &rhs) const { + return count_ == rhs.count_ && mask_ == rhs.mask_; + } + + bool has_default_values() const { + return count_ == 0 && mask_ == 0; + } + + status_t set(dim_t count, int mask) { + count_ = count; + mask_ = mask; + + return status::success; + } + + dim_t count_ = 0; + int mask_ = 0; +}; + } // namespace impl } // namespace dnnl @@ -609,14 +391,64 @@ struct dnnl_post_ops : public dnnl::impl::c_compatible { int mask; }; + struct depthwise_t { + enum 
depthwise_fields { + scales, + shifts, + + fields_count + }; + + dnnl::impl::alg_kind_t alg; + size_t offset[fields_count]; + }; + + struct quantization_t { + enum quantization_fields { + crop_low, + crop_high, + inp_scale, + inp_shift, + output_scale, + output_shift, + + fields_count + }; + + dnnl::impl::alg_kind_t alg; + bool per_channel[fields_count]; + bool all_default[fields_count]; + size_t offset[fields_count]; + }; + + struct binarization_t { + dnnl::impl::alg_kind_t alg; + const float* weights_data; + const float* output_mask_data; + }; + + struct depthwise_conv_old_t { + int in_h; + int in_w; + int ker_h; + int ker_w; + int str_h; + int str_w; + dnnl::impl::data_type_t in_dt; + }; + dnnl::impl::primitive_kind_t kind = dnnl::impl::primitive_kind::undefined; union { sum_t sum; eltwise_t eltwise; depthwise_conv_t depthwise_conv; + depthwise_conv_old_t depthwise_conv_old; binary_t binary; prelu_t prelu; + depthwise_t depthwise; + quantization_t quantization; + binarization_t binarization; }; bool is_eltwise(bool require_scale_one = false) const { @@ -655,8 +487,23 @@ struct dnnl_post_ops : public dnnl::impl::c_compatible { } bool is_like_binary() const { return is_binary() || is_prelu(); } + bool is_depthwise() const { + using namespace dnnl::impl; + return kind == primitive_kind::depthwise; + } - dnnl::impl::status_t set_depthwise_scales(const float *scales); + bool is_quantization() const { + using namespace dnnl::impl; + return kind == primitive_kind::quantization; + } + + bool is_binarization() const { + using namespace dnnl::impl; + return kind == primitive_kind::binarization; + } + + dnnl::impl::status_t validate_binary_with_dst_consistency( + const dnnl::impl::memory_desc_t *dst_desc) const; bool operator==(const entry_t &rhs) const { using namespace dnnl::impl; @@ -676,18 +523,26 @@ struct dnnl_post_ops : public dnnl::impl::c_compatible { && sum.dt == rhs.sum.dt; break; case primitive_kind::convolution: - // Depthwise Only - ret = depthwise_conv.kernel == rhs.depthwise_conv.kernel - && depthwise_conv.stride - == rhs.depthwise_conv.stride - && depthwise_conv.padding - == rhs.depthwise_conv.padding - && depthwise_conv.wei_dt - == rhs.depthwise_conv.wei_dt - && depthwise_conv.bias_dt - == rhs.depthwise_conv.bias_dt - && depthwise_conv.dst_dt - == rhs.depthwise_conv.dst_dt; + // todo: [antonvor] uncomment when new behavior of dw convolution fusing from oneDNN 1.6 will be supported + // // Depthwise Only + // ret = depthwise_conv.kernel == rhs.depthwise_conv.kernel + // && depthwise_conv.stride + // == rhs.depthwise_conv.stride + // && depthwise_conv.padding + // == rhs.depthwise_conv.padding + // && depthwise_conv.wei_dt + // == rhs.depthwise_conv.wei_dt + // && depthwise_conv.bias_dt + // == rhs.depthwise_conv.bias_dt + // && depthwise_conv.dst_dt + // == rhs.depthwise_conv.dst_dt; + ret = depthwise_conv_old.in_h == rhs.depthwise_conv_old.in_h + && depthwise_conv_old.in_w == rhs.depthwise_conv_old.in_w + && depthwise_conv_old.ker_h == rhs.depthwise_conv_old.ker_h + && depthwise_conv_old.ker_w == rhs.depthwise_conv_old.ker_w + && depthwise_conv_old.str_h == rhs.depthwise_conv_old.str_h + && depthwise_conv_old.str_w == rhs.depthwise_conv_old.str_w + && depthwise_conv_old.in_dt == rhs.depthwise_conv_old.in_dt; break; case primitive_kind::binary: ret = binary.alg == rhs.binary.alg @@ -697,6 +552,21 @@ struct dnnl_post_ops : public dnnl::impl::c_compatible { case primitive_kind::prelu: ret = prelu.mask == rhs.prelu.mask; break; + case primitive_kind::depthwise: + ret = depthwise.alg 
== rhs.depthwise.alg + && array_cmp(depthwise.offset, rhs.depthwise.offset, depthwise.fields_count); + break; + case primitive_kind::quantization: + ret = quantization.alg == rhs.quantization.alg + && array_cmp(quantization.per_channel, rhs.quantization.per_channel, quantization.fields_count) + && array_cmp(quantization.all_default, rhs.quantization.all_default, quantization.fields_count) + && array_cmp(quantization.offset, rhs.quantization.offset, quantization.fields_count); + break; + case primitive_kind::binarization: + ret = binarization.alg == rhs.binarization.alg + && binarization.weights_data == rhs.binarization.weights_data + && binarization.output_mask_data == rhs.binarization.output_mask_data; + break; default: assert(!"unsupported post_op"); } return ret; @@ -707,7 +577,7 @@ struct dnnl_post_ops : public dnnl::impl::c_compatible { } }; - dnnl_post_ops() : entry_() {} + dnnl_post_ops() = default; ~dnnl_post_ops() = default; dnnl::impl::status_t append_sum(float scale, int32_t zero_point = 0, @@ -721,6 +591,15 @@ struct dnnl_post_ops : public dnnl::impl::c_compatible { dnnl::impl::status_t append_binary(dnnl::impl::alg_kind_t alg, const dnnl::impl::memory_desc_t *user_src1_desc); dnnl::impl::status_t append_prelu(int mask); + dnnl::impl::status_t append_depthwise(dnnl::impl::alg_kind_t alg, size_t offset_size, const size_t* offset); + dnnl::impl::status_t append_quantization(dnnl::impl::alg_kind_t alg, + size_t per_channel_size, const bool* per_channel, + size_t all_default_size, const bool* all_default, + size_t offset_size, const size_t* offset); + dnnl::impl::status_t append_binarization(dnnl::impl::alg_kind_t alg, const float* weights_data, + const float* output_mask_data); + dnnl::impl::status_t append_dw_conv(int in_h, int in_w, int ker_h, int ker_w, int str_h, int str_w, + dnnl::impl::data_type_t in_dt); dnnl::impl::status_t prepend_binary(dnnl::impl::alg_kind_t alg, const dnnl::impl::memory_desc_t *user_src1_desc); @@ -743,7 +622,16 @@ struct dnnl_post_ops : public dnnl::impl::c_compatible { return dst_dt; } - bool defined() const; + int count(dnnl::impl::primitive_kind_t kind, int start = 0, + int stop = -1) const { + if (stop == -1) stop = len(); + stop = dnnl::impl::nstl::min(stop, len()); + int cnt = 0; + for (int idx = start; idx < stop; ++idx) + if (entry_[idx].kind == kind) cnt++; + return cnt; + } + int len() const { return (int)entry_.size(); } bool has_default_values( const std::vector<dnnl::impl::primitive_kind_t> &skip_pk @@ -777,6 +665,9 @@ struct dnnl_post_ops : public dnnl::impl::c_compatible { || entry_[sum_ind].sum.dt == dst_dt; } + dnnl::impl::status_t validate_binary_with_dst_consistency( + const dnnl::impl::memory_desc_t *dst_desc) const; + bool contain(dnnl::impl::primitive_kind_t kind, int index) const { return find(kind, index, index + 1) == index; } @@ -820,7 +711,8 @@ struct dnnl_primitive_attr : public dnnl::impl::c_compatible { return new dnnl_primitive_attr(*this); } - dnnl_primitive_attr(const dnnl_primitive_attr &other) { + dnnl_primitive_attr(const dnnl_primitive_attr &other) + : c_compatible(other) { if (copy_from(other) != dnnl::impl::status::success) is_initialized_ = false; } @@ -828,7 +720,6 @@ struct dnnl_primitive_attr : public dnnl::impl::c_compatible { dnnl::impl::status_t copy_from(const dnnl_primitive_attr &other) { using namespace dnnl::impl; - output_scales_ = other.output_scales_; scales_ = other.scales_; zero_points_ = other.zero_points_; rounding_mode_ = other.rounding_mode_; @@ -844,6 +735,10 @@
CHECK(rnn_tparams_.copy_from(other.rnn_tparams_)); if (other.gpu_attr_) gpu_attr_ = other.gpu_attr_->clone(); dropout_ = other.dropout_; + input_zero_points_ = (other.input_zero_points_); + weights_zero_points_ = (other.weights_zero_points_); + output_compensations_ = (other.output_compensations_); + src_dyn_quant_params_ = other.src_dyn_quant_params_; return status::success; } @@ -852,28 +747,27 @@ struct dnnl_primitive_attr : public dnnl::impl::c_compatible { enum class skip_mask_t : unsigned { none = 0, - oscale = 1u << 0, - oscale_runtime = 1u << 1, - scales = 1u << 2, - scales_runtime = (unsigned)scales | (1u << 3), + scales = 1u << 1, + scales_groups = (unsigned)scales | (1u << 2), + scales_data_type = (unsigned)scales | (1u << 3), zero_points = 1u << 4, - zero_points_runtime = (unsigned)zero_points | (1u << 5), - post_ops = 1u << 6, - rnn_data_qparams = 1u << 7, - rnn_weights_qparams = 1u << 8, - rnn_tparams = 1u << 9, - sum_dt = 1u << 10, - rnn_weights_projection_qparams = 1u << 11, - gpu_attr = 1u << 12, - accumulation_mode = 1u << 13, - fpmath_mode = 1u << 14, - scales_runtime_groups = (unsigned)scales_runtime | (1u << 15), - scales_runtime_data_type = (unsigned)scales_runtime | (1u << 16), - zero_points_runtime_groups = (unsigned)zero_points_runtime | (1u << 17), - zero_points_runtime_data_type - = (unsigned)zero_points_runtime | (1u << 18), - dropout = 1u << 19, - rounding_mode = 1u << 20, + zero_points_groups = (unsigned)zero_points | (1u << 5), + zero_points_data_type = (unsigned)zero_points | (1u << 6), + post_ops = 1u << 7, + sum_dt = 1u << 8, + rnn_data_qparams = 1u << 9, + rnn_weights_qparams = 1u << 10, + rnn_tparams = 1u << 11, + rnn_weights_projection_qparams = 1u << 12, + gpu_attr = 1u << 13, + accumulation_mode = 1u << 14, + fpmath_mode = 1u << 15, + dropout = 1u << 16, + rounding_mode = 1u << 17, + input_zero_points = 1u << 18, + weights_zero_points = 1u << 19, + output_compensations = 1u << 20, + src_dyn_quant_params = 1u << 21, }; /** Returns true if the attributes have default values. 
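// Illustrative aside (an assumption, not part of the patch): composite
// members of the reworked skip_mask_t above embed their parent bit, e.g.
// scales_groups = (unsigned)scales | (1u << 2), so a caller that skips the
// groups check implicitly skips the plain scales check as well. A minimal
// standalone demonstration of the idiom:
//
//   enum class skip_mask_t : unsigned {
//       none = 0,
//       scales = 1u << 1,
//       scales_groups = (unsigned)scales | (1u << 2),
//   };
//   // (unsigned)skip_mask_t::scales_groups & (unsigned)skip_mask_t::scales
//   // is non-zero: masking groups in or out always carries the parent bit.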
@@ -889,7 +783,6 @@ struct dnnl_primitive_attr : public dnnl::impl::c_compatible { bool ret = scratchpad_mode_ == rhs.scratchpad_mode_ && fpmath_ == rhs.fpmath_ && acc_mode_ == rhs.acc_mode_ && deterministic_ == rhs.deterministic_ - && output_scales_ == rhs.output_scales_ && scales_ == rhs.scales_ && zero_points_ == rhs.zero_points_ && post_ops_ == rhs.post_ops_ && rnn_data_qparams_ == rhs.rnn_data_qparams_ @@ -901,7 +794,11 @@ struct dnnl_primitive_attr : public dnnl::impl::c_compatible { && gpu_attr_->is_equal(*rhs.gpu_attr_)) || (!gpu_attr_ && !rhs.gpu_attr_)) && dropout_ == rhs.dropout_ - && rounding_mode_ == rhs.rounding_mode_; + && rounding_mode_ == rhs.rounding_mode_ + && input_zero_points_ == rhs.input_zero_points_ + && weights_zero_points_ == rhs.weights_zero_points_ + && output_compensations_ == rhs.output_compensations_ + && src_dyn_quant_params_ == rhs.src_dyn_quant_params_; return ret; } @@ -964,8 +861,7 @@ struct dnnl_primitive_attr : public dnnl::impl::c_compatible { } // NOTE: make sure that the types below have overloaded comparison operator - dnnl::impl::runtime_scales_t output_scales_; - dnnl::impl::arg_scales_t scales_; + dnnl::impl::scales_t scales_; dnnl::impl::zero_points_t zero_points_; dnnl::impl::scratchpad_mode_t scratchpad_mode_; dnnl::impl::fpmath_t fpmath_; @@ -973,14 +869,20 @@ struct dnnl_primitive_attr : public dnnl::impl::c_compatible { bool deterministic_; dnnl::impl::post_ops_t post_ops_; dnnl::impl::rnn_data_qparams_t rnn_data_qparams_; - dnnl::impl::scales_t rnn_weights_qparams_; - dnnl::impl::scales_t rnn_weights_projection_qparams_; + dnnl::impl::rnn_create_time_scales_t rnn_weights_qparams_; + dnnl::impl::rnn_create_time_scales_t rnn_weights_projection_qparams_; dnnl::impl::rnn_tparams_t rnn_tparams_; dnnl::impl::dropout_t dropout_; dnnl::impl::rnd_mode_t rounding_mode_; std::unique_ptr gpu_attr_; + dnnl::impl::legacy_zero_points_t input_zero_points_; + dnnl::impl::legacy_zero_points_t weights_zero_points_; + dnnl::impl::legacy_zero_points_t output_compensations_; + + dnnl::impl::src_dyn_quant_params_t src_dyn_quant_params_; + dnnl_primitive_attr &operator=(const dnnl_primitive_attr &other) = delete; }; diff --git a/src/common/primitive_attr_quant.cpp b/src/common/primitive_attr_quant.cpp new file mode 100644 index 00000000000..2c954f8d9e6 --- /dev/null +++ b/src/common/primitive_attr_quant.cpp @@ -0,0 +1,289 @@ +/******************************************************************************* +* Copyright 2024-2025 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +#include "common/primitive_attr_quant.hpp" +#include "common/primitive_hashing.hpp" +#include "common/verbose.hpp" + +namespace dnnl { +namespace impl { + +const quant_entry_t &default_quant_entry() { + static const quant_entry_t default_quant_entry; + return default_quant_entry; +} + +size_t quant_entry_t::get_hash() const { + size_t seed = 0; + seed = hash_combine(seed, mask_); + seed = hash_combine(seed, static_cast<size_t>(data_type_)); + seed = hash_combine(seed, group_ndims_); + if (group_ndims_ > 0) + seed = primitive_hashing::get_array_hash( + seed, group_dims_, group_ndims_); + return seed; +} + +void quant_entry_t::serialize(serialization_stream_t &sstream) const { + sstream.append(mask_); + sstream.append(data_type_); + sstream.append_array(group_ndims_, group_dims_); +} + +quant_entry_t quant_entry_t::deserialize(deserializer_t &d) { + quant_entry_t e; + d.pop(e.mask_); + d.pop(e.data_type_); + size_t group_ndims; + d.pop_array(group_ndims, e.group_dims_); + e.group_ndims_ = static_cast<int>(group_ndims); + return e; +} + +std::string quant_entry_t::get_verbose() const { + std::string s; + s.append(std::to_string(get_mask())); + s.append(":").append(dnnl_dt2str(get_data_type())); + s.append(":").append(std::to_string(type_)); + s.append(":"); + if (group_ndims_ > 0) { + s.append(std::to_string(group_dims_[0])) + .append("x") + .append(std::to_string(group_dims_[1])); + } + s.append(":"); + if (get_ndims() > 0) { + s.append(std::to_string(get_dims()[0])) + .append("x") + .append(std::to_string(get_dims()[1])); + } + + return s; +} + +std::ostream &operator<<(std::ostream &ss, const quant_entry_t &e) { + ss << e.get_verbose(); + return ss; +} + +size_t quant_entries_t::get_hash() const { + size_t seed = 0; + // Go through scales for all arguments.
+ for (const auto &e : entries_) { + seed = hash_combine(seed, e.first); + seed = hash_combine(seed, e.second.get_hash()); + } + return seed; +} + +void quant_entries_t::serialize(serialization_stream_t &sstream) const { + sstream.append(entries_.size()); + for (const auto &e : entries_) { + sstream.append(e.first); + sstream.append(e.second); + } +} + +template <typename T> +T deserialize_entries(deserializer_t &d) { + T entries; + size_t size = d.pop<size_t>(); + for (size_t i = 0; i < size; i++) { + int arg = d.pop<int>(); + entries.set(arg, d.pop<quant_entry_t>()); + } + return entries; +} + +std::string quant_entries_t::get_verbose() const { + std::string s; + std::string empty_delim, attr_delim = "+"; + std::string delim = empty_delim; + for (const auto &scale : entries_) { + const auto &q = scale.second; + if (q.has_default_values()) continue; + + int arg = scale.first; + s.append(delim) + .append(arg2str(arg)) + .append(":") + .append(q.get_verbose()); + delim = attr_delim; + } + return s; +} + +scales_t scales_t::deserialize(deserializer_t &d) { + return deserialize_entries<scales_t>(d); +} + +zero_points_t zero_points_t::deserialize(deserializer_t &d) { + return deserialize_entries<zero_points_t>(d); +} + +status_t quant_entry_t::set(int mask, data_type_t data_type, int group_ndims, + const dims_t group_dims) { + type_ = type_ | DNNL; + is_set_ = true; + mask_ = mask; + data_type_ = data_type; + group_ndims_ = group_ndims; + if (group_ndims_ > 0) { + utils::array_copy(group_dims_, group_dims, group_ndims_); + } + return status::success; +} + +status_t quant_entry_t::set_scales(const dims_t dims, int ndims, data_type_t data_type, int mask) { + type_ = type_ | OV_SCALES; + is_set_scale = true; + ndims_scale = ndims; + mask_scale = mask; + data_type_scale = data_type; + if (ndims_scale > 0) { + utils::array_copy(dims_scale, dims, ndims_scale); + } + return status::success; +} + +status_t quant_entry_t::set_zero_points(const dims_t dims, int ndims, data_type_t data_type, int mask) { + type_ = type_ | DNNL; + is_set_wei = true; + ndims_wei = ndims; + mask_wei = mask; + if (ndims_wei > 0) { + utils::array_copy(dims_wei, dims, ndims_wei); + group_ndims_ = ndims; + utils::array_copy(group_dims_, dims, group_ndims_); + } + data_type_wei = data_type; + return status::success; +} + +status_t quant_entry_t::set_zero_points(const dims_t dims, int ndims, data_type_t data_type) { + type_ = type_ | OV_ZERO_POINTS; + is_set_wei = true; + ndims_wei = ndims; + mask_wei = 1; + if (ndims_wei > 0) { + utils::array_copy(dims_wei, dims, ndims_wei); + } + data_type_wei = data_type; + return status::success; +} + +status_t quant_entry_t::set(const quant_entry_t &other) { + type_ = other.type_; + is_set_ = other.is_set_; + mask_ = other.mask_; + data_type_ = other.data_type_; + group_ndims_ = other.group_ndims_; + if(group_ndims_ > 0) + utils::array_copy(group_dims_, other.group_dims_, group_ndims_); + is_set_scale = other.is_set_scale; + mask_scale = other.mask_scale; + data_type_scale = other.data_type_scale; + ndims_scale = other.ndims_scale; + if (ndims_scale > 0) + utils::array_copy(dims_scale, other.dims_scale, ndims_scale); + is_set_wei = other.is_set_wei; + mask_wei = other.mask_wei; + data_type_wei = other.data_type_wei; + ndims_wei = other.ndims_wei; + if(ndims_wei > 0) + utils::array_copy(dims_wei, other.dims_wei, ndims_wei); + return status::success; +} +int quant_entry_t::get_mask() const { + if (is_set_wei) return mask_wei; + if (is_set_) return mask_; + if (is_set_scale) return mask_scale; + return INT_MIN; +} +data_type_t quant_entry_t::get_data_type()
const { + if (is_set_wei) return data_type_wei; + if (is_set_) return data_type_; + if (is_set_scale) return data_type_scale; + return data_type::undef; +} +const dims_t& quant_entry_t::get_dims() const { + if (is_set_wei) return dims_wei; + if (is_set_) return group_dims_; + if (is_set_scale) return dims_scale; + static const dims_t result = {}; + return result; +} + +int quant_entry_t::get_ndims() const { + if (is_set_wei) return ndims_wei; + if (is_set_) return group_ndims_; + if (is_set_scale) return ndims_scale; + return 0; +} +// Note: keep the definition here to satisfy the +// `gtests/internals/test_comparison_operators` linking requirements which +// mandates bodies to be in the header file. +bool quant_entry_t::operator==(const quant_entry_t &rhs) const { + bool result = (type_ == rhs.type_ && is_set_ == rhs.is_set_ + && mask_ == rhs.mask_ + && data_type_ == rhs.data_type_ + && group_ndims_ == rhs.group_ndims_ + && IMPLICATION(group_ndims_ > 0, + utils::array_cmp( + group_dims_, rhs.group_dims_, group_ndims_))); + + if (!result) return false; + result = (is_set_scale == rhs.is_set_scale + && mask_scale == rhs.mask_scale + && data_type_scale == rhs.data_type_scale + && ndims_scale == rhs.ndims_scale + && IMPLICATION(ndims_scale > 0, + utils::array_cmp( + dims_scale, rhs.dims_scale, ndims_scale))); + + if (!result) return false; + result = (is_set_wei == rhs.is_set_wei + && mask_wei == rhs.mask_wei + && data_type_wei == rhs.data_type_wei + && ndims_wei == rhs.ndims_wei + && IMPLICATION(ndims_wei > 0, + utils::array_cmp( + dims_wei, rhs.dims_wei, ndims_wei))); + return result; +} +status_t quant_entries_t::set_scales(int arg, const dims_t dims, int ndims, data_type_t data_type) { + if (!check_arg(arg)) return status::invalid_arguments; + CHECK(entries_[arg].set_scales(dims, ndims, data_type)); + return status::success; +} +status_t quant_entries_t::set_zero_points(int arg, const dims_t dims, int ndims, data_type_t data_type) { + if (arg != DNNL_ARG_WEIGHTS) return status::unimplemented; + CHECK(entries_[arg].set_zero_points(dims, ndims, data_type)); + return status::success; +} +status_t zero_points_t::set(int arg, int mask, data_type_t data_type, int group_ndims, + const dims_t group_dims) { + if (!check_arg(arg)) return status::invalid_arguments; + if (arg == DNNL_ARG_WEIGHTS) { + CHECK(entries_[arg].set_zero_points(group_dims, group_ndims, data_type, mask)); + } else { + CHECK(entries_[arg].set(mask, data_type, group_ndims, group_dims)); + } + return status::success; +} + +} // namespace impl +} // namespace dnnl diff --git a/src/common/primitive_attr_quant.hpp b/src/common/primitive_attr_quant.hpp new file mode 100644 index 00000000000..de541f2a909 --- /dev/null +++ b/src/common/primitive_attr_quant.hpp @@ -0,0 +1,378 @@ +/******************************************************************************* +* Copyright 2024-2025 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +#ifndef COMMON_PRIMITIVE_ATTR_QUANT_HPP +#define COMMON_PRIMITIVE_ATTR_QUANT_HPP + +// NOTE: Objects declared in this header are moved out from primitive_attr.hpp due +// to micro_sdpa primitive. Int8 support requires at least two primitive_attr +// objects to be used inside sdpa_desc_t object which triggers a deleted +// copy-ctor of primitive_attr_t, which is there because RNN scales still +// rely on static scales and manage dynamically-allocated memory. +// +// As a result, micro_sdpa uses scales and zero-points objects directly and +// requires a dedicated header for that, otherwise, it's going to be a circular +// dependency between headers when it comes to inclusion of opdesc.hpp which +// sdpa_desc_t is a part of. + +#include "common/serialization.hpp" +#include "common/utils.hpp" + +#include <map> +#include <ostream> +#include <string> +#include <vector> + +namespace dnnl { +namespace impl { + +struct quant_entry_t; +const quant_entry_t &default_quant_entry(); + +struct quant_entry_t : public c_compatible { + quant_entry_t() = default; + + // `set(...)` approach is taken over constructors as the usage model assumes + // the change of state of this object but it doesn't require its destruction + // which would come with some performance price which prevails in this case. + status_t set(int mask, data_type_t data_type) { + return set(mask, data_type, 0, {}); + } + status_t set(int mask, data_type_t data_type, int group_ndims, + const dims_t group_dims); + status_t set_scales(const dims_t dims, int ndims, data_type_t data_type = data_type::f32, int mask = 1); + status_t set_zero_points(const dims_t dims, int ndims, data_type_t data_type); + status_t set_zero_points(const dims_t dims, int ndims, data_type_t data_type, int mask); + status_t set(const quant_entry_t &other); + quant_entry_t &operator=(const quant_entry_t &rhs) { + auto st = this->set(rhs); + assert(st == status::success); + UNUSED(st); + return *this; + } + bool has_default_values() const { return *this == default_quant_entry(); } + bool has_default_groups() const { + return this->group_ndims_ == default_quant_entry().group_ndims_; + } + int get_mask() const; + data_type_t get_data_type() const; + const dims_t& get_dims() const; + int get_ndims() const; + dim_t get_group(int d) const { + // If groups were not requested, return `1` for convenience. + if (group_ndims_ == default_quant_entry().group_ndims_) return 1; + // But if they were, any out of bound access would return `0` and likely + // lead to a division by zero which is fast to catch. + if (d >= group_ndims_) return 0; + return group_dims_[d]; + } + + // Note: keep the definition here to satisfy the + // `gtests/internals/test_comparison_operators` linking requirements which + // mandates bodies to be in the header file. + bool operator==(const quant_entry_t &rhs) const; + size_t get_hash() const; + + void serialize(serialization_stream_t &sstream) const; + + static quant_entry_t deserialize(deserializer_t &d); + + std::string get_verbose() const; + +private: + data_type_t data_type_ = data_type::undef; + int group_ndims_ = 0; + dims_t group_dims_ {}; + // Note: INT_MIN is used on purpose to avoid potential issues when + // `(mask & bit)` expression will return `true`. `INT_MIN` is represented + // as `10...0` in bits and will avoid such situations.
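// (Illustrative continuation of the note above, not part of the patch.) A 0
// default could not be told apart from a deliberately set "common" mask of 0,
// and any low bit set by default would make checks such as
// (mask_ & (1 << d)) spuriously true. INT_MIN is 0b1000...0, so:
//
//   int unset = INT_MIN;
//   bool per_dim0 = unset & (1 << 0); // false
//   bool per_dim1 = unset & (1 << 1); // false
//   // get_mask() (see primitive_attr_quant.cpp) returns INT_MIN only when no
//   // variant of the entry was ever set, so callers can distinguish "unset"
//   // from a genuine mask of 0.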
+    int mask_ = INT_MIN;
+    bool is_set_ = false;
+    // openvino extension
+    enum entry_type {
+        NONE = 0,
+        DNNL = 1,
+        OV_SCALES = 2,
+        OV_ZERO_POINTS = 4
+    };
+    int type_ = NONE;
+    // scale
+    bool is_set_scale = false;
+    int ndims_scale = 0;
+    int mask_scale = INT_MIN;
+    dims_t dims_scale {};
+    data_type_t data_type_scale = data_type::undef;
+    // zero_point
+    bool is_set_wei = false;
+    int ndims_wei = 0;
+    int mask_wei = INT_MIN;
+    dims_t dims_wei {};
+    data_type_t data_type_wei = data_type::s32;
+};
+
+std::ostream &operator<<(std::ostream &ss, const quant_entry_t &e);
+
+struct quant_entries_t : public c_compatible {
+    quant_entries_t(data_type_t default_data_type)
+        : default_data_type_(default_data_type) {}
+
+    const quant_entry_t &get(int arg) const {
+        const auto it = entries_.find(arg);
+        if (it == entries_.end()) return default_quant_entry();
+        return it->second;
+    }
+
+    // See the `set(...)` comment in `quant_entry_t` for the explanation of
+    // this design choice.
+    virtual status_t set(int arg, int mask) {
+        return set(arg, mask, default_data_type_, 0, {});
+    }
+    const dims_t &get_dims(int arg) const { return get(arg).get_dims(); }
+    int get_ndims(int arg) const { return get(arg).get_ndims(); }
+    virtual status_t set(int arg, int mask, data_type_t data_type,
+            int group_ndims, const dims_t group_dims) {
+        if (!check_arg(arg)) return status::invalid_arguments;
+        CHECK(entries_[arg].set(mask, data_type, group_ndims, group_dims));
+        return status::success;
+    }
+    status_t set_scales(int arg, const dims_t dims, int ndims,
+            data_type_t data_type = data_type::f32);
+    status_t set_zero_points(
+            int arg, const dims_t dims, int ndims, data_type_t data_type);
+
+    // Use this interface with `default_quant_entry` when a specific entry
+    // needs to be removed.
+    virtual status_t set(int arg, const quant_entry_t &other) {
+        return entries_[arg].set(other);
+    }
+
+    // This interface differs from the one below; it is just a shortcut.
+    bool has_default_values(int arg) const {
+        return get(arg).has_default_values();
+    }
+
+    // This interface is used to make sure that arguments other than
+    // `supported_args` have default values, i.e., that non-allowed
+    // arguments were not passed to the library.
+    bool has_default_values(const std::vector<int> &supported_args = {}) const {
+        auto predicate
+                = [](const quant_entry_t &s) { return s.has_default_values(); };
+        return has_default_property(supported_args, predicate);
+    }
+
+    // This interface checks a specific argument. It exists because
+    // quant_entry_t doesn't have a notion of a default data_type; only this
+    // object does.
+    // Note: once the library unconditionally supports the data type of
+    // scales/zero-points in every implementation, this call can be removed,
+    // since a proper load then requires querying the data type anyway.
+    bool has_default_data_type(int arg) const {
+        // Note: `data_type::undef` represents `default_quant_entry`.
+        return utils::one_of(
+                get(arg).get_data_type(), default_data_type_, data_type::undef);
+    }
+
+    // This interface differs from the one below; it is just a shortcut.
+    bool has_default_groups(int arg) const {
+        return get(arg).has_default_groups();
+    }
+
+    // This interface is used to make sure that arguments other than
+    // `supported_args` have default values, i.e., that non-allowed
+    // arguments were not passed to the library.
+    bool has_default_groups(const std::vector<int> &supported_args = {}) const {
+        auto predicate
+                = [](const quant_entry_t &s) { return s.has_default_groups(); };
+        return has_default_property(supported_args, predicate);
+    }
+
+    int get_mask(int arg) const { return get(arg).get_mask(); }
+    data_type_t get_data_type(int arg) const {
+        return get(arg).get_data_type();
+    }
+    dim_t get_group(int arg, int d) const { return get(arg).get_group(d); }
+
+    bool operator==(const quant_entries_t &rhs) const {
+        return entries_ == rhs.entries_;
+    }
+
+    size_t get_hash() const;
+
+    void serialize(serialization_stream_t &sstream) const;
+
+    std::string get_verbose() const;
+
+protected:
+    // The sorted property of `std::map` is relied upon for hashing.
+    std::map<int, quant_entry_t> entries_;
+    // The value differs depending on the inheritor.
+    data_type_t default_data_type_ = data_type::undef;
+
+    virtual bool check_arg(int arg) const = 0;
+
+    // The function makes sure that if the user specified any argument, only
+    // `supported_args` had their values customized; the rest of the
+    // unsupported arguments were left untouched.
+    bool has_default_property(const std::vector<int> &supported_args,
+            bool (*predicate)(const quant_entry_t &)) const {
+        for (const auto &s : entries_) {
+            // Arg passed the condition, check the next one.
+            if (predicate(s.second)) continue;
+
+            bool allow_non_default = false;
+            for (const auto &supported_arg : supported_args)
+                if (s.first == supported_arg) {
+                    allow_non_default = true;
+                    break;
+                }
+            if (allow_non_default) continue;
+            return false;
+        }
+        return true;
+    }
+};
+
+struct scales_t : public quant_entries_t {
+    scales_t() : quant_entries_t(default_data_type_) {}
+
+    // This interface checks the content of all entries and allows ignoring
+    // certain arguments.
+    // Note: this can't live in `quant_entries_t` because there
+    // `default_data_type_` is not a static member, and `has_default_property`
+    // takes a capture-free function pointer as `predicate`, which can only
+    // reference a static member.
+    bool has_default_data_type(
+            const std::vector<int> &supported_args = {}) const {
+        auto predicate = [](const quant_entry_t &s) {
+            // Note: `data_type::undef` represents `default_quant_entry`.
+            return utils::one_of(
+                    s.get_data_type(), default_data_type_, data_type::undef);
+        };
+        return has_default_property(supported_args, predicate);
+    }
+    // Note: must be present, as the compiler otherwise doesn't see the
+    // overloaded version inside the base class.
+    bool has_default_data_type(int arg) const {
+        return quant_entries_t::has_default_data_type(arg);
+    }
+
+    static scales_t deserialize(deserializer_t &d);
+
+private:
+    static constexpr data_type_t default_data_type_ = data_type::f32;
+
+    bool check_arg(int arg) const override {
+        // regular
+        for (const auto &sa : {DNNL_ARG_SRC, DNNL_ARG_WEIGHTS, DNNL_ARG_DST}) {
+            if (arg == sa) return true;
+        }
+        // binary
+        for (const auto &sa : {DNNL_ARG_SRC_1}) {
+            if (arg == sa) return true;
+        }
+        // concat
+        if (arg & DNNL_ARG_MULTIPLE_SRC) return true;
+        // depth-wise convolution post op
+        for (const auto &sa : {DNNL_ARG_SRC, DNNL_ARG_WEIGHTS, DNNL_ARG_DST}) {
+            if (arg == (DNNL_ARG_ATTR_POST_OP_DW | sa)) return true;
+        }
+        // sdpa
+        if (arg == DNNL_ARG_SRC_2) return true;
+        return false;
+    }
+};
+
+struct zero_points_t : public quant_entries_t {
+    zero_points_t() : quant_entries_t(default_data_type_) {}
+
+    // This interface checks the content of all entries and allows ignoring
+    // certain arguments.
+    // Note: this can't live in `quant_entries_t` because there
+    // `default_data_type_` is not a static member, and `has_default_property`
+    // takes a capture-free function pointer as `predicate`, which can only
+    // reference a static member.
+    bool has_default_data_type(
+            const std::vector<int> &supported_args = {}) const {
+        auto predicate = [](const quant_entry_t &s) {
+            // Note: `data_type::undef` represents `default_quant_entry`.
+            return utils::one_of(
+                    s.get_data_type(), default_data_type_, data_type::undef);
+        };
+        return has_default_property(supported_args, predicate);
+    }
+    // Note: must be present, as the compiler otherwise doesn't see the
+    // overloaded version inside the base class.
+    bool has_default_data_type(int arg) const {
+        return quant_entries_t::has_default_data_type(arg);
+    }
+
+    static zero_points_t deserialize(deserializer_t &d);
+    status_t set(int arg, int mask) override {
+        return quant_entries_t::set(arg, mask, default_data_type_, 0, {});
+    }
+    status_t set(int arg, int mask, data_type_t data_type, int group_ndims,
+            const dims_t group_dims) override;
+
+    status_t set(int arg, const quant_entry_t &other) override {
+        return quant_entries_t::set(arg, other);
+    }
+
+private:
+    static constexpr data_type_t default_data_type_ = data_type::s32;
+
+    bool check_arg(int arg) const override {
+        // regular
+        // The gemm internal primitive uses DNNL_ARG_A, DNNL_ARG_B, and
+        // DNNL_ARG_C, which map to DNNL_ARG_WEIGHTS, DNNL_ARG_SRC, and
+        // DNNL_ARG_DST. They are defined in the GPU internals and thus are
+        // not spelled out here.
+        for (const auto &sa : {DNNL_ARG_SRC, DNNL_ARG_WEIGHTS, DNNL_ARG_DST}) {
+            if (arg == sa) return true;
+        }
+        // sdpa
+        if (arg == DNNL_ARG_SRC_2) return true;
+        return false;
+    }
+};
+
+struct src_dyn_quant_params_t : public c_compatible {
+    src_dyn_quant_params_t() : group_size_(0) {}
+    bool has_default_values() const { return (group_size_ == 0); }
+    bool defined() const { return true; }
+
+    status_t set(uint64_t group_size) {
+        group_size_ = group_size;
+        return status::success;
+    }
+
+    uint64_t get() const { return group_size_; }
+
+    bool operator==(const src_dyn_quant_params_t &rhs) const {
+        return group_size_ == rhs.group_size_;
+    }
+
+private:
+    uint64_t group_size_;
+};
+
+} // namespace impl
+} // namespace dnnl
+
+#endif
diff --git a/src/common/primitive_cache.cpp b/src/common/primitive_cache.cpp
index 31f506c814c..5264b12a0d8 100644
--- a/src/common/primitive_cache.cpp
+++ b/src/common/primitive_cache.cpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2020-2023 Intel Corporation
+* Copyright 2020-2025 Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
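Stepping back from the patch for a moment: the entry lookup in `quant_entries_t` above always goes through `get(arg)`, which falls back to `default_quant_entry()` for anything that was never set. Below is a minimal sketch of that behavior (illustrative only; it assumes the internal `primitive_attr_quant.hpp` header and the `DNNL_ARG_*` macros are visible to the translation unit):

    #include <cassert>
    #include "common/primitive_attr_quant.hpp"

    using namespace dnnl::impl;

    void scales_lookup_sketch() {
        scales_t scales;

        // Nothing set yet: every argument reports defaults, because get()
        // resolves unknown arguments to default_quant_entry().
        assert(scales.has_default_values(DNNL_ARG_WEIGHTS));
        assert(scales.has_default_data_type(DNNL_ARG_WEIGHTS)); // f32 default

        // A per-tensor (mask = 0) f32 scale on weights flips only that entry.
        assert(scales.set(DNNL_ARG_WEIGHTS, /* mask = */ 0) == status::success);
        assert(!scales.has_default_values(DNNL_ARG_WEIGHTS));
        assert(scales.has_default_values(DNNL_ARG_DST));

        // The vector overload checks that nothing outside the allowed set was
        // customized; weights are allowed here, so the property still holds.
        assert(scales.has_default_values({DNNL_ARG_WEIGHTS}));
    }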
@@ -133,6 +133,18 @@ primitive_cache_iface_t::result_t primitive_cache_iface_t::get_or_create( return {std::move(r.value), r.status}; } +status_t set_primitive_cache_capacity( + int primitive_capacity, int kernel_capacity) { + if (primitive_capacity < 0 || kernel_capacity < 0) + return status::invalid_arguments; +#ifndef DNNL_DISABLE_PRIMITIVE_CACHE + auto status = global_primitive_cache().set_capacity(primitive_capacity); + CHECK(status); + return kernel_cache::get().set_capacity(kernel_capacity); +#endif + return status::success; +} + } // namespace impl } // namespace dnnl @@ -148,11 +160,5 @@ dnnl::impl::status_t dnnl_get_primitive_cache_capacity(int *capacity) { } dnnl::impl::status_t dnnl_set_primitive_cache_capacity(int capacity) { - if (capacity < 0) return dnnl::impl::status::invalid_arguments; -#ifndef DNNL_DISABLE_PRIMITIVE_CACHE - auto status = dnnl::impl::global_primitive_cache().set_capacity(capacity); - if (status != dnnl::impl::status::success) return status; - return dnnl::impl::kernel_cache::get().set_capacity(capacity); -#endif - return dnnl::impl::status::success; + return dnnl::impl::set_primitive_cache_capacity(capacity, capacity); } diff --git a/src/common/primitive_cache.hpp b/src/common/primitive_cache.hpp index bab4a8155dd..25f90e8fbf2 100644 --- a/src/common/primitive_cache.hpp +++ b/src/common/primitive_cache.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2023 Intel Corporation +* Copyright 2019-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -59,6 +59,8 @@ struct primitive_cache_iface_t { }; primitive_cache_iface_t primitive_cache(); +status_t set_primitive_cache_capacity( + int primitive_capacity, int kernel_capacity); // Undocumented API for testing. status_t DNNL_API get_primitive_cache_size(int *size); diff --git a/src/common/primitive_desc.hpp b/src/common/primitive_desc.hpp index 0939b28a6b2..69b6e52487b 100644 --- a/src/common/primitive_desc.hpp +++ b/src/common/primitive_desc.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2016-2024 Intel Corporation +* Copyright 2016-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -25,6 +25,7 @@ #include "cache_blob.hpp" #include "cache_blob_id.hpp" #include "cache_hit_types.hpp" +#include "dnnl_sel_build.hpp" #include "memory_tracking.hpp" #include "nstl.hpp" #include "opdesc.hpp" @@ -47,6 +48,7 @@ static int po_inputs(const post_ops_t &post_ops, const primitive_kind_t kind) { struct impl_list_item_t; struct primitive_t; // Primitive descriptor implementation +// NOLINTBEGIN(google-default-arguments) struct primitive_desc_t : public c_compatible { primitive_desc_t(const primitive_attr_t *attr, primitive_kind_t kind) : attr_(*attr), kind_(kind), pd_iterator_offset_(0), skip_idx_(-1) { @@ -80,7 +82,7 @@ struct primitive_desc_t : public c_compatible { // doesn't require any special handling since `get_verbose` is `false`. std::string info_with_runtime_dims(engine_t *engine, const memory_desc_t *src_md, const memory_desc_t *wei_md, - const memory_desc_t *bia_md, const memory_desc_t *dst_md) { + const memory_desc_t *bia_md, const memory_desc_t *dst_md) const { std::string info_str = info(engine); // Matmul and reorder are the only primitives supporting runtime dims. 
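Returning to the primitive-cache change above: the public C API keeps its single-knob behavior by forwarding the same value twice, while the new internal entry point lets the two capacities diverge. A hedged sketch of both call patterns (capacity values are arbitrary):

    #include "oneapi/dnnl/dnnl.h"
    #include "common/primitive_cache.hpp"

    void cache_capacity_sketch() {
        // Public C API: one capacity drives both caches, since it now
        // forwards as set_primitive_cache_capacity(capacity, capacity).
        dnnl_set_primitive_cache_capacity(1024);

        // Internal API: the caches can be sized independently, e.g. a small
        // primitive cache backed by a larger kernel cache.
        dnnl::impl::set_primitive_cache_capacity(
                /* primitive_capacity = */ 256, /* kernel_capacity = */ 2048);
    }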
@@ -150,45 +152,60 @@ struct primitive_desc_t : public c_compatible { enum class arg_usage_t { unused, input, output }; virtual arg_usage_t arg_usage(int arg) const { using types::is_zero_md; - if (arg == DNNL_ARG_ATTR_OUTPUT_SCALES - && !attr()->output_scales_.defined()) + + if ((arg & (DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_SRC)) + && !attr()->input_zero_points_.has_default_values()) + return arg_usage_t::input; + if ((arg & (DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_WEIGHTS)) + && !attr()->weights_zero_points_.has_default_values()) + return arg_usage_t::input; + if ((arg & (DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_DST)) + && !attr()->output_compensations_.has_default_values() + && arg != DNNL_ARG_SCRATCHPAD) return arg_usage_t::input; + if (arg & DNNL_ARG_ATTR_ZERO_POINTS) { int zp_arg = arg & ~DNNL_ARG_ATTR_ZERO_POINTS; - if (!attr()->zero_points_.defined(zp_arg)) - return arg_usage_t::input; + return !attr()->zero_points_.has_default_values(zp_arg) + ? arg_usage_t::input + : arg_usage_t::unused; } if (arg & DNNL_ARG_ATTR_SCALES) { int scale_arg = arg & ~DNNL_ARG_ATTR_SCALES; - if (!attr()->scales_.get(scale_arg).defined()) - return arg_usage_t::input; + return !attr()->scales_.has_default_values(scale_arg) + ? arg_usage_t::input + : arg_usage_t::unused; } - if ((arg == (DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC_0)) - && !attr()->scales_.get(DNNL_ARG_SRC_0).defined()) - return arg_usage_t::input; - if ((arg == (DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC_1)) - && !attr()->scales_.get(DNNL_ARG_SRC_1).defined()) - return arg_usage_t::input; - if (arg == DNNL_ARG_SCRATCHPAD && !is_zero_md(scratchpad_md())) - return arg_usage_t::output; - if (arg == DNNL_ARG_ATTR_DROPOUT_MASK - && !attr()->dropout_.has_default_values()) - return arg_usage_t::output; - if ((arg == DNNL_ARG_ATTR_DROPOUT_PROBABILITY - || arg == DNNL_ARG_ATTR_DROPOUT_SEED) - && !attr()->dropout_.has_default_values()) - return arg_usage_t::input; - if ((arg == DNNL_ARG_ATTR_ROUNDING_SEED) - && !attr()->rounding_mode_.has_default_values()) - return arg_usage_t::input; + + if (arg == DNNL_ARG_SCRATCHPAD) + return !is_zero_md(scratchpad_md()) ? arg_usage_t::output + : arg_usage_t::unused; + if (arg == DNNL_ARG_ATTR_DROPOUT_MASK) + return !attr()->dropout_.has_default_values() ? arg_usage_t::output + : arg_usage_t::unused; + if (arg == DNNL_ARG_ATTR_DROPOUT_PROBABILITY) + return !attr()->dropout_.has_default_values() ? arg_usage_t::input + : arg_usage_t::unused; + if (arg == DNNL_ARG_ATTR_DROPOUT_SEED) + return !attr()->dropout_.has_default_values() ? arg_usage_t::input + : arg_usage_t::unused; + if (arg == DNNL_ARG_ATTR_ROUNDING_SEED) + return !attr()->rounding_mode_.has_default_values() + ? 
arg_usage_t::input + : arg_usage_t::unused; + for (int idx = 0; idx < attr()->post_ops_.len(); ++idx) { using namespace primitive_kind; - if (post_op_has_proper_input( - attr(), binary, idx, arg, DNNL_ARG_SRC_1) - || post_op_has_proper_input( - attr(), prelu, idx, arg, DNNL_ARG_WEIGHTS)) + if (post_op_has_proper_input(attr(), binary, idx, arg, DNNL_ARG_SRC_1) || + post_op_has_proper_input(attr(), depthwise, idx, arg, DNNL_ARG_SRC_1) || + post_op_has_proper_input(attr(), quantization, idx, arg, DNNL_ARG_SRC_1) || + post_op_has_proper_input(attr(), prelu, idx, arg, DNNL_ARG_WEIGHTS)) return arg_usage_t::input; } + if (arg == (DNNL_ARG_ATTR_POST_OP_DW | DNNL_ARG_WEIGHTS)) + return arg_usage_t::input; + if (arg == (DNNL_ARG_ATTR_POST_OP_DW | DNNL_ARG_BIAS)) + return arg_usage_t::input; return arg_usage_t::unused; } @@ -354,6 +371,15 @@ struct primitive_desc_t : public c_compatible { int n_prelu_po_inputs() const { return po_inputs(attr()->post_ops_, primitive_kind::prelu); } + + int n_depthwise_po_inputs() const { + return po_inputs(attr()->post_ops_, primitive_kind::depthwise); + } + + int n_quantization_po_inputs() const { + return po_inputs(attr()->post_ops_, primitive_kind::quantization); + } + // The `hint_mds(bool is_hint)` returns a vector of memory descriptors // that might affect the equality of primitive descriptors for backward pass. // @@ -438,7 +464,6 @@ struct primitive_desc_t : public c_compatible { memory_tracking::registry_t scratchpad_registry_; -protected: void init_pd_iterator_offset(int offset) { pd_iterator_offset_ = offset; } void init_skip_idx(int skip_idx) { skip_idx_ = skip_idx; } @@ -460,11 +485,11 @@ struct primitive_desc_t : public c_compatible { /** the only reason why this class is here is the inability of * utils::make_unique() to operate on protected parent classes * of the derivative pd_t's; compilers should optimize it out */ - class pd_t_compat : public pd_t { + class pd_compat_t : public pd_t { public: - pd_t_compat(Args &&...args) : pd_t(std::forward(args)...) {} + pd_compat_t(Args &&...args) : pd_t(std::forward(args)...) {} }; - return utils::make_unique(std::forward(args)...); + return utils::make_unique(std::forward(args)...); } template @@ -472,13 +497,11 @@ struct primitive_desc_t : public c_compatible { const primitive_attr_t *attr, engine_t *engine, const primitive_desc_t *hint_fwd) { using namespace dnnl::impl::status; - using pd_op_desc_t = typename pkind_traits::desc_type; - if (adesc->kind != pd_t::base_pkind) return invalid_arguments; + if (adesc->primitive_kind != pd_t::base_pkind) return invalid_arguments; assert(hint_fwd ? 
hint_fwd->kind() == pd_t::base_pkind : true); auto hint = reinterpret_cast(hint_fwd); - auto _pd - = make_unique_pd((const pd_op_desc_t *)adesc, attr, hint); + auto _pd = make_unique_pd(adesc, attr, hint); if (_pd == nullptr) return out_of_memory; if (!_pd->is_initialized()) return out_of_memory; CHECK(_pd->init(engine)); @@ -488,6 +511,7 @@ struct primitive_desc_t : public c_compatible { friend struct dnnl::impl::impl_list_item_t; }; +// NOLINTEND(google-default-arguments) } // namespace impl } // namespace dnnl @@ -503,6 +527,7 @@ struct primitive_desc_t : public c_compatible { &primitive, \ dnnl::impl::engine_t *engine, const cache_blob_t &cache_blob) \ const override { \ + DNNL_PRIMITIVE_CREATE(pd_t) \ return primitive_t::create_primitive_common( \ primitive, this, engine, use_global_scratchpad, cache_blob); \ } \ diff --git a/src/common/primitive_desc_iface.cpp b/src/common/primitive_desc_iface.cpp index f359a72e6c3..263f29bff66 100644 --- a/src/common/primitive_desc_iface.cpp +++ b/src/common/primitive_desc_iface.cpp @@ -37,7 +37,7 @@ status_t primitive_desc_create(primitive_desc_iface_t **primitive_desc_iface, if (!primitive_desc_iface) return invalid_arguments; - const bool known_primitive_kind = utils::one_of(op_desc->kind, + const bool known_primitive_kind = utils::one_of(op_desc->primitive_kind, batch_normalization, binary, convolution, deconvolution, eltwise, gemm, group_normalization, inner_product, layer_normalization, lrn, matmul, pooling, prelu, reduction, resampling, rnn, sdpa, shuffle, diff --git a/src/common/primitive_desc_iterator.hpp b/src/common/primitive_desc_iterator.hpp index 39fe8f51838..096d20642a6 100644 --- a/src/common/primitive_desc_iterator.hpp +++ b/src/common/primitive_desc_iterator.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2018-2023 Intel Corporation +* Copyright 2018-2024 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -38,26 +38,19 @@ struct primitive_desc_iterator_t : public c_compatible { int skip_idx = -1) : idx_(-1) , engine_(engine) - , op_desc_(nullptr) + , op_desc_(op_desc->clone()) , attr_(attr ? 
*attr : primitive_attr_t()) , hint_fwd_pd_(hint_fwd_pd) - , impl_list_(nullptr) + , impl_list_(engine_->get_implementation_list(op_desc_.get())) , last_idx_(0) , skip_idx_(skip_idx) , offset_(-1) { - op_desc_ = (op_desc_t *)std::malloc(sizeof(op_desc_t)); - copy_c_op_desc(op_desc_, op_desc); - - impl_list_ = engine_->get_implementation_list(op_desc_); - while (impl_list_[last_idx_]) ++last_idx_; is_initialized_ = is_initialized_ && attr_.is_initialized(); } - ~primitive_desc_iterator_t() { std::free(op_desc_); } - engine_t *engine() const { return engine_; } bool operator==(const primitive_desc_iterator_t &rhs) const { @@ -82,7 +75,7 @@ struct primitive_desc_iterator_t : public c_compatible { std::vector hint_mds; if (hint_fwd_pd_) hint_mds = hint_fwd_pd_->hint_mds(true /* is_hint */); primitive_hashing::key_t key( - engine_, op_desc_, &attr_, offset_, hint_mds, skip_idx_); + engine_, op_desc_.get(), &attr_, offset_, hint_mds, skip_idx_); pd_ = primitive_cache().get_pd(key); if (pd_) { return *this; } @@ -90,8 +83,8 @@ struct primitive_desc_iterator_t : public c_compatible { while (++idx_ != last_idx_) { if (idx_ == skip_idx_) continue; primitive_desc_t *candidate_pd = nullptr; - auto s = impl_list_[idx_](&candidate_pd, op_desc_, &attr_, engine_, - hint_fwd_pd_, offset_, skip_idx_); + auto s = impl_list_[idx_](&candidate_pd, op_desc_.get(), &attr_, + engine_, hint_fwd_pd_, offset_, skip_idx_); if (s == status::success) { pd_.reset(candidate_pd); break; @@ -110,7 +103,7 @@ struct primitive_desc_iterator_t : public c_compatible { int idx_; engine_t *engine_; std::shared_ptr pd_; - op_desc_t *op_desc_; + std::unique_ptr op_desc_; const primitive_attr_t attr_; const primitive_desc_t *hint_fwd_pd_; const impl_list_item_t *impl_list_; @@ -122,7 +115,6 @@ struct primitive_desc_iterator_t : public c_compatible { primitive_desc_iterator_t(engine_t *engine, int last_idx) : idx_(last_idx) , engine_(engine) - , op_desc_(nullptr) , hint_fwd_pd_(nullptr) , impl_list_(nullptr) , last_idx_(last_idx) @@ -133,7 +125,7 @@ struct primitive_desc_iterator_t : public c_compatible { : idx_(other.idx_) , engine_(other.engine_) , pd_(std::move(other.pd_)) - , op_desc_(other.op_desc_) + , op_desc_(std::move(other.op_desc_)) , attr_(other.attr_) , hint_fwd_pd_(other.hint_fwd_pd_) , impl_list_(other.impl_list_) diff --git a/src/common/primitive_exec_types.cpp b/src/common/primitive_exec_types.cpp index db9d582ebaa..4f1c3e6295b 100644 --- a/src/common/primitive_exec_types.cpp +++ b/src/common/primitive_exec_types.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2018-2024 Intel Corporation +* Copyright 2018-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
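A side note on the iterator change above: replacing the malloc'd `op_desc_t` copy with `op_desc->clone()` held in a `std::unique_ptr` removes the hand-written destructor and makes the move constructor correct by construction. A minimal sketch of the same ownership pattern (illustrative; these are not the library's actual types):

    #include <memory>
    #include <utility>

    struct desc_base_t {
        virtual ~desc_base_t() = default;
        virtual std::unique_ptr<desc_base_t> clone() const = 0;
    };

    struct iterator_sketch_t {
        // One deep copy, owned for the iterator's whole lifetime.
        explicit iterator_sketch_t(const desc_base_t &d) : desc_(d.clone()) {}

        // Moving transfers ownership: no free()/delete bookkeeping, and the
        // moved-from object is left holding nullptr, which destructs safely.
        iterator_sketch_t(iterator_sketch_t &&other) noexcept
            : desc_(std::move(other.desc_)) {}

    private:
        std::unique_ptr<desc_base_t> desc_;
    };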
@@ -49,8 +49,7 @@ status_t cvt_primitive_args(const primitive_desc_t *pd, int nargs, case primitive_desc_t::arg_usage_t::input: args[arg] = {mem, true}; n_inputs++; - extra_inputs += (arg == DNNL_ARG_ATTR_OUTPUT_SCALES) - || (arg & DNNL_ARG_ATTR_ZERO_POINTS) + extra_inputs += (arg & DNNL_ARG_ATTR_ZERO_POINTS) || (arg & DNNL_ARG_ATTR_SCALES) // 1x1 + dw conv fusion || (arg @@ -136,7 +135,7 @@ void *exec_ctx_t::host_ptr( if (do_zeropad) status = mem->zero_pad(*this); if (status_) *status_ = status; - auto *mem_storage = mem->memory_storage(index); + auto *mem_storage = mem->memory_storage(); return host_ptr(mem_storage); } diff --git a/src/common/primitive_exec_types.hpp b/src/common/primitive_exec_types.hpp index f52e3399ae2..c7c9c2ac75f 100644 --- a/src/common/primitive_exec_types.hpp +++ b/src/common/primitive_exec_types.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2018-2023 Intel Corporation +* Copyright 2018-2024 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -25,9 +25,22 @@ #include "memory.hpp" #include "memory_storage.hpp" -#define CTX_IN_STORAGE(arg) \ +// __VA_ARGS__here is an index of the buffer. It is empty unless the memory +// argument is sparse. +#define CTX_IN_STORAGE(arg, ...) CTX_IN_STORAGe##__VA_ARGS__(arg) + +#define CTX_IN_STORAGe(arg) \ (ctx.input(arg) ? *(ctx.input(arg)->memory_storage()) \ : dnnl::impl::memory_storage_t::empty_storage()) +#define CTX_IN_STORAGe0(arg) \ + (ctx.input(arg) ? *ctx.input(arg)->memory_storage(0) \ + : dnnl::impl::memory_storage_t::empty_storage()) +#define CTX_IN_STORAGe1(arg) \ + (ctx.input(arg) ? *ctx.input(arg)->memory_storage(1) \ + : dnnl::impl::memory_storage_t::empty_storage()) +#define CTX_IN_STORAGe2(arg) \ + (ctx.input(arg) ? *ctx.input(arg)->memory_storage(2) \ + : dnnl::impl::memory_storage_t::empty_storage()) // Returns destination memory which may not have been zero pad initialized. #define CTX_OUT_STORAGE(arg) \ diff --git a/src/common/primitive_hashing.cpp b/src/common/primitive_hashing.cpp index 97b657607ec..99a6239ad9c 100644 --- a/src/common/primitive_hashing.cpp +++ b/src/common/primitive_hashing.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2024 Intel Corporation +* Copyright 2019-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,6 +14,8 @@ * limitations under the License. 
*******************************************************************************/ +#include +#include "primitive_attr.hpp" #include "primitive_desc.hpp" #include "type_helpers.hpp" #include "utils.hpp" @@ -29,7 +31,7 @@ namespace primitive_hashing { key_t::key_t(const engine_t *engine, const op_desc_t *op_desc, const primitive_attr_t *attr, int pd_iterator_offset, const std::vector &hint_mds, int skip_idx) - : primitive_kind_(op_desc->kind) + : primitive_kind_(op_desc->primitive_kind) , op_desc_(op_desc) , attr_(attr) , pd_iterator_offset_(pd_iterator_offset) @@ -53,22 +55,32 @@ bool key_t::operator==(const key_t &rhs) const { && hint_mds_.size() == rhs.hint_mds_.size() && pd_iterator_offset_ == rhs.pd_iterator_offset_ && impl_nthr_ == rhs.impl_nthr_ - && skip_idx_ == rhs.skip_idx_ - && (*attr_) == (*rhs.attr_); - - if (!ret) return false; + && skip_idx_ == rhs.skip_idx_ + && (*attr_) == (*rhs.attr_) + && std::equal( + hint_mds_.begin(), hint_mds_.end(), rhs.hint_mds_.begin()); + + if (!ret) { + // ANCHOR: HASHING_DEBUGINFO_16. + VDEBUGINFO(16, primitive, hashing, "operator==,ret=%d", ret); + return ret; + } #define CASE(pkind) \ case primitive_kind::pkind: \ - ret = cast_to_desc(op_desc_) \ - == cast_to_desc(rhs.op_desc_); \ + ret = *op_desc_t::to_desc(op_desc_) \ + == *op_desc_t::to_desc(rhs.op_desc_); \ break; switch ((int)primitive_kind_) { CASE(batch_normalization) CASE(binary) CASE(concat) - CASE(convolution) + // Use a custom comparison function that ignores alg_kind. + case primitive_kind::convolution: + ret = compare_conv_opdesc(*op_desc_t::to_desc(op_desc_), + *op_desc_t::to_desc(rhs.op_desc_)); + break; CASE(deconvolution) CASE(eltwise) CASE(gemm) @@ -93,231 +105,9 @@ bool key_t::operator==(const key_t &rhs) const { #undef CASE // clang-format on - if (!ret) return false; - - for (size_t i = 0; i < hint_mds_.size(); ++i) - if (hint_mds_[i] != rhs.hint_mds_[i]) return false; - - return true; -} - -// Combine hash of each memory_desc_t data member -size_t get_md_hash(const memory_desc_t &md) { - size_t seed = 0; - seed = get_array_hash(seed, md.dims, md.ndims); - seed = hash_combine(seed, static_cast(md.data_type)); - seed = get_array_hash(seed, md.padded_dims, md.ndims); - seed = get_array_hash(seed, md.padded_offsets, md.ndims); - seed = hash_combine(seed, md.offset0); - seed = hash_combine(seed, static_cast(md.format_kind)); - // format desc - switch ((int)md.format_kind) { - case format_kind::undef: - case format_kind::any: break; - case format_kind::blocked: - for (int i = 0; i < md.ndims; i++) { - if (md.dims[i] == 1 && md.padded_dims[i] == 1) continue; - seed = hash_combine(seed, md.format_desc.blocking.strides[i]); - } - seed = hash_combine(seed, md.format_desc.blocking.inner_nblks); - seed = get_array_hash(seed, md.format_desc.blocking.inner_blks, - md.format_desc.blocking.inner_nblks); - seed = get_array_hash(seed, md.format_desc.blocking.inner_idxs, - md.format_desc.blocking.inner_nblks); - break; - case format_kind::wino: - seed = hash_combine(seed, - static_cast(md.format_desc.wino_desc.wino_format)); - seed = hash_combine(seed, md.format_desc.wino_desc.r); - seed = hash_combine(seed, md.format_desc.wino_desc.alpha); - seed = hash_combine(seed, md.format_desc.wino_desc.ic); - seed = hash_combine(seed, md.format_desc.wino_desc.oc); - seed = hash_combine(seed, md.format_desc.wino_desc.ic_block); - seed = hash_combine(seed, md.format_desc.wino_desc.oc_block); - seed = hash_combine(seed, md.format_desc.wino_desc.ic2_block); - seed = hash_combine(seed, 
md.format_desc.wino_desc.oc2_block); - seed = hash_combine(seed, md.format_desc.wino_desc.adj_scale); - seed = hash_combine(seed, md.format_desc.wino_desc.size); - break; - case format_kind::rnn_packed: - seed = hash_combine(seed, - static_cast(md.format_desc.rnn_packed_desc.format)); - seed = hash_combine(seed, md.format_desc.rnn_packed_desc.n_parts); - seed = hash_combine(seed, md.format_desc.rnn_packed_desc.n); - seed = hash_combine(seed, md.format_desc.rnn_packed_desc.ldb); - { - int n_parts = md.format_desc.rnn_packed_desc.n_parts; - seed = get_array_hash( - seed, md.format_desc.rnn_packed_desc.parts, n_parts); - seed = get_array_hash(seed, - md.format_desc.rnn_packed_desc.part_pack_size, n_parts); - seed = get_array_hash(seed, - md.format_desc.rnn_packed_desc.pack_part, n_parts); - } - seed = hash_combine( - seed, md.format_desc.rnn_packed_desc.offset_compensation); - seed = hash_combine(seed, md.format_desc.rnn_packed_desc.size); - break; -#ifdef DNNL_EXPERIMENTAL_SPARSE - case format_kind::sparse: - seed = hash_combine(seed, - static_cast(md.format_desc.sparse_desc.encoding)); - seed = hash_combine(seed, md.format_desc.sparse_desc.nnz); - seed = get_array_hash(seed, - md.format_desc.sparse_desc.metadata_types, - sparse_desc_t::max_metadata_types); - // User cannot initialize `packed_desc` therefore `packed_desc` - // is always zero initialized. - break; -#endif - default: assert(!"unknown format_kind"); - } - - if (md.extra.flags != dnnl_memory_extra_flag_none) { - seed = hash_combine(seed, md.extra.flags); - if ((md.extra.flags - & (dnnl_memory_extra_flag_compensation_conv_s8s8 - | dnnl_memory_extra_flag_rnn_u8s8_compensation)) - && !types::extra_flag_rnn_s8s8_compensation_is_set( - md.extra.flags)) { - seed = hash_combine(seed, md.extra.compensation_mask); - } - - if (md.extra.flags & dnnl_memory_extra_flag_scale_adjust) { - seed = hash_combine(seed, md.extra.scale_adjust); - } - - if (md.extra.flags - & dnnl_memory_extra_flag_compensation_conv_asymmetric_src) { - seed = hash_combine(seed, md.extra.asymm_compensation_mask); - } - } - // Combined hash for a memory descriptor - return seed; -} - -// Combine hash of each primitive_attr_t data member -size_t get_attr_hash(const primitive_attr_t &attr) { - size_t seed = 0; - // scratchpad_mode - seed = hash_combine(seed, static_cast(attr.scratchpad_mode_)); - // fpmath_mode - seed = hash_combine(seed, static_cast(attr.fpmath_.mode_)); - seed = hash_combine(seed, static_cast(attr.fpmath_.apply_to_int_)); - // deterministic - seed = hash_combine(seed, static_cast(attr.deterministic_)); - // acc_mode - seed = hash_combine(seed, static_cast(attr.acc_mode_)); - // rounding_mode - if (!attr.rounding_mode_.has_default_values()) { - for (const auto &e : attr.rounding_mode_.rounding_modes_map_) { - seed = hash_combine(seed, e.first); - seed = hash_combine(seed, static_cast(e.second)); - } - } - - if (!attr.output_scales_.has_default_values()) { - // output_scales: mask - seed = hash_combine(seed, attr.output_scales_.mask_); - } else if (!attr.scales_.has_default_values()) { - // go through scales for all arguments - for (const auto &p : attr.scales_.scales_) { - // scales: arg - seed = hash_combine(seed, p.first); - // scales: mask - seed = hash_combine(seed, p.second.mask_); - // scales: groups - const int ndims = p.second.ndims_; - seed = hash_combine(seed, ndims); - if (ndims > 0) - seed = get_array_hash(seed, p.second.group_dims_, ndims); - // scales: data type - seed = hash_combine(seed, static_cast(p.second.data_type_)); - } - } - // 
zero_points - for (int arg : {DNNL_ARG_SRC, DNNL_ARG_WEIGHTS, DNNL_ARG_DST}) - if (!attr.zero_points_.has_default_values(arg)) { - const auto &zps = attr.zero_points_; - // zero_points: arg - seed = hash_combine(seed, arg); - int mask = 0; - zps.get(arg, &mask); - // zero_points: mask - seed = hash_combine(seed, mask); - // zero points: groups - const int ndims = zps.get_groups_ndims(arg); - seed = hash_combine(seed, ndims); - if (ndims > 0) - seed = get_array_hash(seed, zps.get_groups(arg), ndims); - // zero points: data type - seed = hash_combine( - seed, static_cast(zps.get_data_type(arg))); - } - // post_ops: entry[:] - for (int i = 0; i < attr.post_ops_.len(); i++) { - const auto &entry = attr.post_ops_.entry_[i]; - switch (entry.kind) { - case primitive_kind::eltwise: - seed = hash_combine( - seed, static_cast(entry.eltwise.alg)); - seed = hash_combine(seed, entry.eltwise.scale); - seed = hash_combine(seed, entry.eltwise.alpha); - seed = hash_combine(seed, entry.eltwise.beta); - break; - case primitive_kind::sum: - seed = hash_combine(seed, entry.sum.scale); - seed = hash_combine(seed, entry.sum.zero_point); - seed = hash_combine(seed, static_cast(entry.sum.dt)); - break; - case primitive_kind::convolution: - seed = hash_combine( - seed, static_cast(entry.depthwise_conv.kernel)); - seed = hash_combine( - seed, static_cast(entry.depthwise_conv.stride)); - seed = hash_combine(seed, - static_cast(entry.depthwise_conv.padding)); - seed = hash_combine( - seed, static_cast(entry.depthwise_conv.wei_dt)); - seed = hash_combine(seed, - static_cast(entry.depthwise_conv.bias_dt)); - seed = hash_combine( - seed, static_cast(entry.depthwise_conv.dst_dt)); - break; - case primitive_kind::binary: - seed = hash_combine( - seed, static_cast(entry.binary.alg)); - seed = hash_combine( - seed, get_md_hash(entry.binary.user_src1_desc)); - break; - case primitive_kind::prelu: - seed = hash_combine( - seed, static_cast(entry.prelu.mask)); - break; - default: assert(!"unknown post_op"); - } - } - // rnn_data_qparams: scale, shift - seed = hash_combine(seed, attr.rnn_data_qparams_.scale_); - seed = hash_combine(seed, attr.rnn_data_qparams_.shift_); - if (!attr.rnn_weights_qparams_.has_default_values()) { - // rnn_weights_qparams: mask - seed = hash_combine(seed, attr.rnn_weights_qparams_.mask_); - // rnn_weights_qparams: count - seed = hash_combine(seed, attr.rnn_weights_qparams_.count_); - // rnn_weights_qparams: scales[:] - seed = get_array_hash(seed, attr.rnn_weights_qparams_.scales_, - attr.rnn_weights_qparams_.count_); - } - if (attr.gpu_attr_) { - seed = hash_combine(seed, attr.gpu_attr_->get_hash()); - } - if (!attr.dropout_.has_default_values()) { - seed = hash_combine( - seed, get_md_hash(attr.dropout_.user_dropout_desc_)); - } - // Combined hash for attributes - return seed; + // ANCHOR: HASHING_DEBUGINFO_16. 
+ VDEBUGINFO(16, primitive, hashing, "operator==,ret=%d", ret); + return ret; } // Functions that compute hash for different op_descs @@ -366,6 +156,8 @@ size_t get_desc_hash(const binary_desc_t &desc) { // Memory descriptors seed = hash_combine(seed, get_md_hash(desc.src_desc[0])); seed = hash_combine(seed, get_md_hash(desc.src_desc[1])); + if (desc.alg_kind == alg_kind::binary_select) + seed = hash_combine(seed, get_md_hash(desc.src_desc[2])); seed = hash_combine(seed, get_md_hash(desc.dst_desc)); // Combined hash for binary op desc return seed; @@ -377,7 +169,18 @@ size_t get_desc_hash(const convolution_desc_t &desc) { // Kinds seed = hash_combine(seed, static_cast(desc.primitive_kind)); seed = hash_combine(seed, static_cast(desc.prop_kind)); - seed = hash_combine(seed, static_cast(desc.alg_kind)); + + // Ignore `alg_kind` to keep hash value consistent for any algorithm. + // + // Background: when a convolution primitive descriptor is created for + // the algorithm `auto` we overwrite `alg_kind` field in `op_desc` when + // store it in the primitive descriptor. Because of that, the `op_desc` + // stored in the primitive descriptor is different from the one user + // passed to oneDNN API. Because of the difference the requested + // primitive descriptor cannot be found in the cache if we hash/compare + // `alg_kind`. + //seed = hash_combine(seed, static_cast(desc.alg_kind)); + // Memory descriptors seed = hash_combine(seed, get_md_hash(desc.src_desc)); seed = hash_combine(seed, get_md_hash(desc.diff_src_desc)); @@ -530,6 +333,9 @@ size_t get_desc_hash(const matmul_desc_t &desc) { seed = hash_combine(seed, get_md_hash(desc.weights_desc)); seed = hash_combine(seed, get_md_hash(desc.bias_desc)); seed = hash_combine(seed, get_md_hash(desc.dst_desc)); + seed = hash_combine(seed, get_md_hash(desc.reduce_desc)); + // Reduce kind. + seed = hash_combine(seed, static_cast(desc.reduce_kind)); // Accumulator type seed = hash_combine(seed, static_cast(desc.accum_data_type)); // Combined hash for matmul op desc @@ -727,11 +533,17 @@ size_t get_desc_hash(const sdpa_desc_t &desc) { seed = hash_combine(seed, get_md_hash(desc.q_desc)); seed = hash_combine(seed, get_md_hash(desc.k_desc)); seed = hash_combine(seed, get_md_hash(desc.v_desc)); + seed = hash_combine(seed, desc.kq_scales.get_hash()); + seed = hash_combine(seed, desc.kq_zero_points.get_hash()); + seed = hash_combine(seed, desc.vs_scales.get_hash()); + seed = hash_combine(seed, desc.vs_zero_points.get_hash()); seed = hash_combine(seed, get_md_hash(desc.dst_desc)); seed = hash_combine(seed, get_md_hash(desc.attn_mask_desc)); // Scale type seed = hash_combine(seed, static_cast(desc.scale_dt)); seed = hash_combine(seed, desc.invert_scale); + seed = hash_combine(seed, desc.kv_head_number); + seed = hash_combine(seed, static_cast(desc.mask_type)); // Combined hash for sdpa desc return seed; } diff --git a/src/common/primitive_hashing.hpp b/src/common/primitive_hashing.hpp index fa33f920e55..655ed95c93d 100644 --- a/src/common/primitive_hashing.hpp +++ b/src/common/primitive_hashing.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2024 Intel Corporation +* Copyright 2019-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
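To make the cache-lookup rationale above concrete, here is a hedged sketch of the invariant that skipping `alg_kind` preserves. `get_desc_hash` and `compare_conv_opdesc` are the functions this patch references; `make_conv_desc` is a hypothetical helper standing in for regular descriptor initialization:

    #include <cassert>

    using namespace dnnl::impl;

    void conv_hash_invariant_sketch() {
        // Two descriptors identical except for alg_kind: the user asks for
        // `auto`, the library stores the algorithm it actually picked.
        convolution_desc_t user_desc = make_conv_desc(alg_kind::convolution_auto);
        convolution_desc_t stored_desc = user_desc;
        stored_desc.alg_kind = alg_kind::convolution_direct;

        // Since alg_kind is excluded from both the hash and the comparison,
        // the user's original descriptor still finds the cached entry.
        assert(primitive_hashing::get_desc_hash(user_desc)
                == primitive_hashing::get_desc_hash(stored_desc));
        assert(compare_conv_opdesc(user_desc, stored_desc));
    }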
@@ -21,11 +21,11 @@ #include #include -#include "c_types_map.hpp" -#include "engine_id.hpp" -#include "oneapi/dnnl/dnnl.h" -#include "primitive_attr.hpp" -#include "type_helpers.hpp" +#include "common/c_types_map.hpp" +#include "common/engine_id.hpp" +#include "common/type_helpers.hpp" +#include "common/verbose.hpp" +#include "common/primitive_hashing_utils.hpp" namespace dnnl { namespace impl { @@ -59,11 +59,6 @@ struct key_t { engine_id_t engine_id_; private: - template - static const desc_t &cast_to_desc(const void *p) { - return *(reinterpret_cast(p)); - } - static primitive_kind_t get_pkind(primitive_kind_t pkind); // Thread ID is not used as part of the key, it's only used to get @@ -72,8 +67,6 @@ struct key_t { std::thread::id thread_id_; }; -size_t get_md_hash(const memory_desc_t &md); -size_t get_attr_hash(const primitive_attr_t &attr); size_t get_desc_hash(const concat_desc_t &desc); size_t get_desc_hash(const batch_normalization_desc_t &desc); size_t get_desc_hash(const binary_desc_t &desc); @@ -97,39 +90,6 @@ size_t get_desc_hash(const softmax_desc_t &desc); size_t get_desc_hash(const sum_desc_t &desc); size_t get_desc_hash(const zero_pad_desc_t &desc); -template -size_t get_array_hash(size_t seed, const T *v, int size) { - for (int i = 0; i < size; i++) { - seed = hash_combine(seed, v[i]); - } - return seed; -} - -template <> -inline size_t get_array_hash( - size_t seed, const memory_desc_t *v, int size) { - for (int i = 0; i < size; i++) { - seed = hash_combine(seed, get_md_hash(v[i])); - } - return seed; -} - -inline size_t get_array_hash( - size_t seed, const std::vector &mds) { - for (const auto *md : mds) - seed = hash_combine(seed, get_md_hash(*md)); - return seed; -} - -template <> -inline size_t get_array_hash( - size_t seed, const data_type_t *v, int size) { - for (int i = 0; i < size; i++) { - seed = hash_combine(seed, static_cast(v[i])); - } - return seed; -} - } // namespace primitive_hashing } // namespace impl } // namespace dnnl @@ -153,11 +113,19 @@ struct hash { seed = hash_combine(seed, hash_combine(0, key.skip_idx_)); seed = hash_combine(seed, key.engine_id_.hash()); + + seed = get_array_hash( + seed, key.hint_mds_.data(), (int)key.hint_mds_.size()); + + const result_type verb_seed_before_desc = seed; + UNUSED(verb_seed_before_desc); + // Combine hash for op_desc with the computed hash #define CASE(pkind) \ case primitive_kind::pkind: \ - seed = hash_combine( \ - seed, get_desc_hash(*(pkind##_desc_t *)key.op_desc_)); \ + seed = hash_combine(seed, \ + get_desc_hash( \ + *op_desc_t::to_desc(key.op_desc_))); \ break; // clang-format off @@ -189,8 +157,13 @@ struct hash { } // clang-format on #undef CASE - seed = get_array_hash( - seed, key.hint_mds_.data(), (int)key.hint_mds_.size()); + + // Note: `16` is just a random number, as debuginfo hasn't received a + // single command center for levels across layers of the library. + // ANCHOR: HASHING_DEBUGINFO_16. 
+ VDEBUGINFO(16, primitive, hashing, + "operator(),seed_before_desc=%zu seed_after_desc=%zu", + verb_seed_before_desc, seed); return seed; } diff --git a/src/common/primitive_hashing_utils.cpp b/src/common/primitive_hashing_utils.cpp new file mode 100644 index 00000000000..6a84ebe4dd3 --- /dev/null +++ b/src/common/primitive_hashing_utils.cpp @@ -0,0 +1,254 @@ +/******************************************************************************* +* Copyright 2019-2025 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#include +#include "primitive_attr.hpp" +#include "primitive_desc.hpp" +#include "type_helpers.hpp" +#include "utils.hpp" + +#include "dnnl_thread.hpp" +#include "engine.hpp" +#include "primitive_hashing_utils.hpp" + +namespace dnnl { +namespace impl { +namespace primitive_hashing { + +// Combine hash of each memory_desc_t data member +size_t get_md_hash(const memory_desc_t &md) { + size_t seed = 0; + seed = get_array_hash(seed, md.dims, md.ndims); + seed = hash_combine(seed, static_cast(md.data_type)); + seed = get_array_hash(seed, md.padded_dims, md.ndims); + seed = get_array_hash(seed, md.padded_offsets, md.ndims); + seed = hash_combine(seed, md.offset0); + seed = hash_combine(seed, static_cast(md.format_kind)); + // format desc + switch ((int)md.format_kind) { + case format_kind::undef: + case format_kind::any: break; + case format_kind::blocked: + for (int i = 0; i < md.ndims; i++) { + if (md.dims[i] == 1 && md.padded_dims[i] == 1) continue; + seed = hash_combine(seed, md.format_desc.blocking.strides[i]); + } + seed = hash_combine(seed, md.format_desc.blocking.inner_nblks); + seed = get_array_hash(seed, md.format_desc.blocking.inner_blks, + md.format_desc.blocking.inner_nblks); + seed = get_array_hash(seed, md.format_desc.blocking.inner_idxs, + md.format_desc.blocking.inner_nblks); + break; + case format_kind::wino: + seed = hash_combine(seed, + static_cast(md.format_desc.wino_desc.wino_format)); + seed = hash_combine(seed, md.format_desc.wino_desc.r); + seed = hash_combine(seed, md.format_desc.wino_desc.alpha); + seed = hash_combine(seed, md.format_desc.wino_desc.ic); + seed = hash_combine(seed, md.format_desc.wino_desc.oc); + seed = hash_combine(seed, md.format_desc.wino_desc.ic_block); + seed = hash_combine(seed, md.format_desc.wino_desc.oc_block); + seed = hash_combine(seed, md.format_desc.wino_desc.ic2_block); + seed = hash_combine(seed, md.format_desc.wino_desc.oc2_block); + seed = hash_combine(seed, md.format_desc.wino_desc.adj_scale); + seed = hash_combine(seed, md.format_desc.wino_desc.size); + break; + case format_kind::cublaslt_blocked: + seed = hash_combine(seed, + static_cast(md.format_desc.cublaslt_blocked_desc + .cublaslt_format)); + seed = hash_combine( + seed, (md.format_desc.cublaslt_blocked_desc.size)); + break; + case format_kind::rnn_packed: + seed = hash_combine(seed, + static_cast(md.format_desc.rnn_packed_desc.format)); + seed = hash_combine(seed, 
md.format_desc.rnn_packed_desc.n_parts);
+        seed = hash_combine(seed, md.format_desc.rnn_packed_desc.n);
+        seed = hash_combine(seed, md.format_desc.rnn_packed_desc.ldb);
+        {
+            int n_parts = md.format_desc.rnn_packed_desc.n_parts;
+            seed = get_array_hash(
+                    seed, md.format_desc.rnn_packed_desc.parts, n_parts);
+            seed = get_array_hash(seed,
+                    md.format_desc.rnn_packed_desc.part_pack_size, n_parts);
+            seed = get_array_hash(seed,
+                    md.format_desc.rnn_packed_desc.pack_part, n_parts);
+        }
+        seed = hash_combine(
+                seed, md.format_desc.rnn_packed_desc.offset_compensation);
+        seed = hash_combine(seed, md.format_desc.rnn_packed_desc.size);
+        break;
+#ifdef DNNL_EXPERIMENTAL_SPARSE
+    case format_kind::sparse:
+        seed = hash_combine(seed,
+                static_cast<size_t>(md.format_desc.sparse_desc.encoding));
+        seed = hash_combine(seed, md.format_desc.sparse_desc.nnz);
+        seed = get_array_hash(seed,
+                md.format_desc.sparse_desc.metadata_types,
+                sparse_desc_t::max_metadata_types);
+        // User cannot initialize `packed_desc`, therefore `packed_desc`
+        // is always zero initialized.
+        break;
+#else
+    case format_kind::sparse:
+        seed = hash_combine(seed,
+                static_cast<size_t>(md.format_desc.sparse_desc.encoding));
+        // User cannot initialize `packed_desc`, therefore, at this point,
+        // `packed_desc` is always zero initialized.
+        break;
+#endif
+    default: assert(!"unknown format_kind");
+    }
+
+    if (md.extra.flags != dnnl_memory_extra_flag_none) {
+        seed = hash_combine(seed, md.extra.flags);
+        if (md.extra.flags
+                & (dnnl_memory_extra_flag_compensation_conv_s8s8
+                        | dnnl_memory_extra_flag_rnn_u8s8_compensation)) {
+            seed = hash_combine(seed, md.extra.compensation_mask);
+        }
+
+        if (md.extra.flags & dnnl_memory_extra_flag_scale_adjust) {
+            seed = hash_combine(seed, md.extra.scale_adjust);
+        }
+
+        if (md.extra.flags
+                & dnnl_memory_extra_flag_compensation_conv_asymmetric_src) {
+            seed = hash_combine(seed, md.extra.asymm_compensation_mask);
+        }
+
+        if (md.extra.flags
+                & dnnl_memory_extra_flag_compensation_gpu_conv_asymmetric_src) {
+            seed = get_array_hash(seed, md.extra.idhw, 3);
+            seed = get_array_hash(seed, md.extra.odhw, 3);
+            seed = get_array_hash(seed, md.extra.pdhw, 3);
+            seed = get_array_hash(seed, md.extra.ddhw, 3);
+            seed = hash_combine(seed, md.extra.dst_size);
+        }
+    }
+    // Combined hash for a memory descriptor
+    return seed;
+}
+
+// Combine hash of each primitive_attr_t data member
+size_t get_attr_hash(const primitive_attr_t &attr) {
+    size_t seed = 0;
+    // scratchpad_mode
+    seed = hash_combine(seed, static_cast<size_t>(attr.scratchpad_mode_));
+    // fpmath_mode
+    seed = hash_combine(seed, static_cast<size_t>(attr.fpmath_.mode_));
+    seed = hash_combine(seed, static_cast<size_t>(attr.fpmath_.apply_to_int_));
+    // deterministic
+    seed = hash_combine(seed, static_cast<size_t>(attr.deterministic_));
+    // acc_mode
+    seed = hash_combine(seed, static_cast<size_t>(attr.acc_mode_));
+    // rounding_mode
+    if (!attr.rounding_mode_.has_default_values()) {
+        for (const auto &e : attr.rounding_mode_.rounding_modes_map_) {
+            seed = hash_combine(seed, e.first);
+            seed = hash_combine(seed, static_cast<size_t>(e.second));
+        }
+    }
+
+    if (!attr.scales_.has_default_values()) {
+        seed = hash_combine(seed, attr.scales_.get_hash());
+    }
+
+    if (!attr.zero_points_.has_default_values()) {
+        seed = hash_combine(seed, attr.zero_points_.get_hash());
+    }
+
+    // post_ops: entry[:]
+    seed = get_post_op_hash(seed, attr.post_ops_);
+    // rnn_data_qparams: scale, shift
+    seed = hash_combine(seed, attr.rnn_data_qparams_.scale_);
+    seed = hash_combine(seed, attr.rnn_data_qparams_.shift_);
+    if
(!attr.rnn_weights_qparams_.has_default_values()) { + // rnn_weights_qparams: mask + seed = hash_combine(seed, attr.rnn_weights_qparams_.mask_); + // rnn_weights_qparams: count + seed = hash_combine(seed, attr.rnn_weights_qparams_.count_); + // rnn_weights_qparams: scales[:] + seed = get_array_hash(seed, attr.rnn_weights_qparams_.scales_, + attr.rnn_weights_qparams_.count_); + } + if (attr.gpu_attr_) { + seed = hash_combine(seed, attr.gpu_attr_->get_hash()); + } + if (!attr.dropout_.has_default_values()) { + seed = hash_combine( + seed, get_md_hash(attr.dropout_.user_dropout_desc_)); + } + seed = hash_combine(seed, attr.src_dyn_quant_params_.get()); + // Combined hash for attributes + return seed; +} + +// Combine hash of each post_ops::entry_ +size_t get_post_op_hash(size_t seed, const post_ops_t &post_ops) { + for (int i = 0; i < post_ops.len(); i++) { + const auto &entry = post_ops.entry_[i]; + switch (entry.kind) { + case primitive_kind::eltwise: + seed = hash_combine( + seed, static_cast(entry.eltwise.alg)); + seed = hash_combine(seed, entry.eltwise.scale); + seed = hash_combine(seed, entry.eltwise.alpha); + seed = hash_combine(seed, entry.eltwise.beta); + break; + case primitive_kind::sum: + seed = hash_combine(seed, entry.sum.scale); + seed = hash_combine(seed, static_cast(entry.sum.dt)); + break; + case primitive_kind::convolution: + seed = hash_combine(seed, static_cast(entry.depthwise_conv_old.in_h)); + seed = hash_combine(seed, static_cast(entry.depthwise_conv_old.in_w)); + seed = hash_combine(seed, static_cast(entry.depthwise_conv_old.ker_h)); + seed = hash_combine(seed, static_cast(entry.depthwise_conv_old.ker_w)); + seed = hash_combine(seed, static_cast(entry.depthwise_conv_old.str_h)); + seed = hash_combine(seed, static_cast(entry.depthwise_conv_old.str_w)); + seed = hash_combine(seed, static_cast(entry.depthwise_conv_old.in_dt)); + break; + case primitive_kind::binary: + seed = hash_combine( + seed, static_cast(entry.binary.alg)); + seed = hash_combine( + seed, get_md_hash(entry.binary.user_src1_desc)); + break; + case primitive_kind::prelu: + seed = hash_combine( + seed, static_cast(entry.prelu.mask)); + break; + case primitive_kind::depthwise: + seed = hash_combine(seed, static_cast(entry.depthwise.alg)); + seed = get_array_hash(seed, entry.depthwise.offset, entry.depthwise.fields_count); + break; + case primitive_kind::quantization: + seed = hash_combine(seed, static_cast(entry.quantization.alg)); + seed = get_array_hash(seed, entry.quantization.per_channel, entry.quantization.fields_count); + seed = get_array_hash(seed, entry.quantization.all_default, entry.quantization.fields_count); + seed = get_array_hash(seed, entry.quantization.offset, entry.quantization.fields_count); + break; + default: assert(!"unknown post_op"); + } + } + + return seed; +} + +} // namespace primitive_hashing +} // namespace impl +} // namespace dnnl diff --git a/src/common/primitive_hashing_utils.hpp b/src/common/primitive_hashing_utils.hpp new file mode 100644 index 00000000000..27a7490d093 --- /dev/null +++ b/src/common/primitive_hashing_utils.hpp @@ -0,0 +1,71 @@ +/******************************************************************************* +* Copyright 2019-2025 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. 
+* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#ifndef COMMON_PRIMITIVE_HASHING_UTILS_HPP +#define COMMON_PRIMITIVE_HASHING_UTILS_HPP + +#include +#include +#include + +#include "common/c_types_map.hpp" +#include "common/engine_id.hpp" +#include "common/type_helpers.hpp" +#include "common/verbose.hpp" + +namespace dnnl { +namespace impl { +struct primitive_desc_t; +namespace primitive_hashing { + +size_t get_md_hash(const memory_desc_t &md); +size_t get_attr_hash(const primitive_attr_t &attr); +size_t get_post_op_hash(size_t seed, const post_ops_t &post_ops); + +template +size_t get_array_hash(size_t seed, const T *v, int size) { + for (int i = 0; i < size; i++) { + seed = hash_combine(seed, v[i]); + } + return seed; +} + +template <> +inline size_t get_array_hash( + size_t seed, const memory_desc_t *v, int size) { + for (int i = 0; i < size; i++) { + seed = hash_combine(seed, get_md_hash(v[i])); + } + return seed; +} + +inline size_t get_array_hash( + size_t seed, const std::vector &mds) { + for (const auto *md : mds) + seed = hash_combine(seed, get_md_hash(*md)); + return seed; +} + +template +size_t get_vector_hash(size_t seed, const std::vector &vec) { + return get_array_hash(seed, vec.data(), vec.size()); +} + +} // namespace primitive_hashing +} // namespace impl +} // namespace dnnl + +#endif diff --git a/src/common/primitive_serialization.cpp b/src/common/primitive_serialization.cpp new file mode 100644 index 00000000000..b0132bc197b --- /dev/null +++ b/src/common/primitive_serialization.cpp @@ -0,0 +1,596 @@ +/******************************************************************************* +* Copyright 2021-2025 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +#include "common/primitive_serialization.hpp" +#include "common/type_helpers.hpp" +#include "common/utils.hpp" + +namespace dnnl { +namespace impl { + +status_t serialize_desc( + serialization_stream_t &sstream, const op_desc_t *op_desc) { +#define CASE(pkind) \ + case primitive_kind::pkind: \ + serialize(sstream, *(const pkind##_desc_t *)op_desc); \ + break; + + switch ((int)op_desc->primitive_kind) { + CASE(batch_normalization) + CASE(binary) + CASE(concat) + CASE(convolution) + CASE(deconvolution) + CASE(eltwise) + CASE(gemm) + CASE(group_normalization) + CASE(inner_product) + CASE(layer_normalization) + CASE(lrn) + CASE(matmul) + CASE(pooling) + CASE(prelu) + CASE(reduction) + CASE(reorder) + CASE(resampling) + CASE(rnn) + CASE(sdpa) + CASE(shuffle) + CASE(softmax) + CASE(sum) + default: return status::invalid_arguments; + } +#undef CASE + return status::success; +} + +void serialize(serialization_stream_t &sstream, const memory_desc_t &md) { + sstream.append(md.ndims); + sstream.append_array(md.ndims, md.dims); + sstream.append(md.data_type); + sstream.append_array(md.ndims, md.padded_dims); + sstream.append_array(md.ndims, md.padded_offsets); + sstream.append(md.offset0); + sstream.append(md.format_kind); + // format desc + switch ((int)md.format_kind) { +#ifdef DNNL_EXPERIMENTAL_SPARSE + case format_kind::sparse: +#endif + case format_kind::undef: + case format_kind::any: break; + case format_kind::blocked: + sstream.append_array(md.ndims, md.format_desc.blocking.strides); + sstream.append(md.format_desc.blocking.inner_nblks); + sstream.append_array(md.format_desc.blocking.inner_nblks, + md.format_desc.blocking.inner_blks); + sstream.append_array(md.format_desc.blocking.inner_nblks, + md.format_desc.blocking.inner_idxs); + break; + case format_kind::wino: + sstream.append(md.format_desc.wino_desc.wino_format); + sstream.append(md.format_desc.wino_desc.r); + sstream.append(md.format_desc.wino_desc.alpha); + sstream.append(md.format_desc.wino_desc.ic); + sstream.append(md.format_desc.wino_desc.oc); + sstream.append(md.format_desc.wino_desc.ic_block); + sstream.append(md.format_desc.wino_desc.oc_block); + sstream.append(md.format_desc.wino_desc.ic2_block); + sstream.append(md.format_desc.wino_desc.oc2_block); + sstream.append(md.format_desc.wino_desc.adj_scale); + sstream.append(md.format_desc.wino_desc.size); + break; + case format_kind::cublaslt_blocked: + sstream.append( + md.format_desc.cublaslt_blocked_desc.cublaslt_format); + sstream.append(md.format_desc.cublaslt_blocked_desc.size); + break; + case format_kind::rnn_packed: + sstream.append(md.format_desc.rnn_packed_desc.format); + sstream.append(md.format_desc.rnn_packed_desc.n_parts); + sstream.append(md.format_desc.rnn_packed_desc.n); + sstream.append(md.format_desc.rnn_packed_desc.ldb); + { + int n_parts = md.format_desc.rnn_packed_desc.n_parts; + sstream.append_array( + n_parts, md.format_desc.rnn_packed_desc.parts); + sstream.append_array( + n_parts, md.format_desc.rnn_packed_desc.part_pack_size); + sstream.append_array( + n_parts, md.format_desc.rnn_packed_desc.pack_part); + } + sstream.append(md.format_desc.rnn_packed_desc.offset_compensation); + sstream.append(md.format_desc.rnn_packed_desc.size); + break; + default: assert(!"unknown format_kind"); + } + + if (md.extra.flags != dnnl_memory_extra_flag_none) { + sstream.append(md.extra.flags); + if (md.extra.flags + & (dnnl_memory_extra_flag_compensation_conv_s8s8 + | 
dnnl_memory_extra_flag_rnn_u8s8_compensation)) { + sstream.append(md.extra.compensation_mask); + } + if (md.extra.flags & dnnl_memory_extra_flag_scale_adjust) { + sstream.append(md.extra.scale_adjust); + } + if (md.extra.flags + & dnnl_memory_extra_flag_compensation_conv_asymmetric_src) { + sstream.append(md.extra.asymm_compensation_mask); + } + if (md.extra.flags + & dnnl_memory_extra_flag_compensation_gpu_conv_asymmetric_src) { + sstream.append_array(3, md.extra.idhw); + sstream.append_array(3, md.extra.odhw); + sstream.append_array(3, md.extra.pdhw); + sstream.append_array(3, md.extra.ddhw); + sstream.append(md.extra.dst_size); + } + } +} + +void serialize(serialization_stream_t &sstream, const post_ops_t &post_ops) { + // post_ops: entry[:] + for (int i = 0; i < post_ops.len(); i++) { + const auto &entry = post_ops.entry_[i]; + switch (entry.kind) { + case primitive_kind::eltwise: + sstream.append(entry.eltwise.alg); + sstream.append(entry.eltwise.scale); + sstream.append(entry.eltwise.alpha); + sstream.append(entry.eltwise.beta); + break; + case primitive_kind::sum: + sstream.append(entry.sum.scale); + sstream.append(entry.sum.zero_point); + sstream.append(entry.sum.dt); + break; + case primitive_kind::convolution: + sstream.append(entry.depthwise_conv.kernel); + sstream.append(entry.depthwise_conv.stride); + sstream.append(entry.depthwise_conv.padding); + sstream.append(entry.depthwise_conv.wei_dt); + sstream.append(entry.depthwise_conv.bias_dt); + sstream.append(entry.depthwise_conv.dst_dt); + break; + case primitive_kind::binary: + sstream.append(entry.binary.alg); + serialize(sstream, entry.binary.user_src1_desc); + break; + case primitive_kind::prelu: sstream.append(entry.prelu.mask); break; + default: assert(!"unknown post_op"); + } + } +} + +void serialize(serialization_stream_t &sstream, const primitive_attr_t &attr) { + // scratchpad_mode + sstream.append(attr.scratchpad_mode_); + // fpmath_mode + sstream.append(attr.fpmath_.mode_); + sstream.append(attr.fpmath_.apply_to_int_); + // deterministic + sstream.append(attr.deterministic_); + // acc_mode + sstream.append(attr.acc_mode_); + + if (!attr.scales_.has_default_values()) { + sstream.append('s'); + attr.scales_.serialize(sstream); + } + // zero_points + if (!attr.zero_points_.has_default_values()) { + sstream.append('z'); + attr.zero_points_.serialize(sstream); + } + + // Rounding modes + if (!attr.rounding_mode_.has_default_values()) sstream.append('r'); + for (const auto &e : attr.rounding_mode_.rounding_modes_map_) { + if (!attr.rounding_mode_.has_default_values(e.first)) { + sstream.append(e.first); + sstream.append(e.second); + } + } + + if (!attr.dropout_.has_default_values()) { + sstream.append('d'); + serialize(sstream, attr.dropout_.user_dropout_desc_); + } + + serialize(sstream, attr.post_ops_); + + // rnn_data_qparams: scale, shift + sstream.append(attr.rnn_data_qparams_.scale_); + sstream.append(attr.rnn_data_qparams_.shift_); + if (!attr.rnn_weights_qparams_.has_default_values()) { + // rnn_weights_qparams: mask + sstream.append(attr.rnn_weights_qparams_.mask_); + // rnn_weights_qparams: count + sstream.append(attr.rnn_weights_qparams_.count_); + // rnn_weights_qparams: scales[:] + sstream.append_array(attr.rnn_weights_qparams_.count_, + attr.rnn_weights_qparams_.scales_); + } + if (attr.gpu_attr_) { + attr.gpu_attr_->serialize(sstream); + } else { + int zero = 0; + sstream.append(zero); + } + sstream.append(attr.src_dyn_quant_params_.get()); +} + +void serialize(serialization_stream_t &sstream, const 
concat_desc_t &desc) { + // Kinds + sstream.append(desc.primitive_kind); + // Memory descriptors + serialize(sstream, *desc.dst_md); + // N + sstream.append(desc.n); + // Concat dimension + sstream.append(desc.concat_dimension); + // Array of mds + for (int i = 0; i < desc.n; i++) + serialize(sstream, *desc.src_mds[i]); +} + +void serialize(serialization_stream_t &sstream, + const batch_normalization_desc_t &desc) { + // Kinds + sstream.append(desc.primitive_kind); + sstream.append(desc.prop_kind); + // Memory descriptors + serialize(sstream, desc.src_desc); + serialize(sstream, desc.dst_desc); + serialize(sstream, desc.diff_src_desc); + serialize(sstream, desc.diff_dst_desc); + serialize(sstream, desc.scaleshift_desc); + serialize(sstream, desc.diff_scaleshift_desc); + serialize(sstream, desc.stat_desc); + // Epsilon + sstream.append(desc.batch_norm_epsilon); + // Flags + sstream.append(desc.flags); +} + +void serialize(serialization_stream_t &sstream, const binary_desc_t &desc) { + // Kinds + sstream.append(desc.primitive_kind); + sstream.append(desc.alg_kind); + // Memory descriptors + serialize(sstream, desc.src_desc[0]); + serialize(sstream, desc.src_desc[1]); + serialize(sstream, desc.src_desc[2]); + serialize(sstream, desc.dst_desc); +} + +// (De-)Convolution +void serialize( + serialization_stream_t &sstream, const convolution_desc_t &desc) { + // Kinds + sstream.append(desc.primitive_kind); + sstream.append(desc.prop_kind); + sstream.append(desc.alg_kind); + // Memory descriptors + serialize(sstream, desc.src_desc); + serialize(sstream, desc.diff_src_desc); + serialize(sstream, desc.weights_desc); + serialize(sstream, desc.diff_weights_desc); + serialize(sstream, desc.bias_desc); + serialize(sstream, desc.diff_bias_desc); + serialize(sstream, desc.dst_desc); + serialize(sstream, desc.diff_dst_desc); + // Strides, dilates, padding + sstream.append_array(DNNL_MAX_NDIMS, desc.strides); + sstream.append_array(DNNL_MAX_NDIMS, desc.dilates); + sstream.append_array(DNNL_MAX_NDIMS, desc.padding[0]); + sstream.append_array(DNNL_MAX_NDIMS, desc.padding[1]); + // Accumulator type + sstream.append(desc.accum_data_type); + // Internal member + sstream.append(desc.use_inversion); +} + +// Eltwise +void serialize(serialization_stream_t &sstream, const eltwise_desc_t &desc) { + // Kinds + sstream.append(desc.primitive_kind); + sstream.append(desc.prop_kind); + sstream.append(desc.alg_kind); + // Memory descriptors + serialize(sstream, desc.src_desc); + serialize(sstream, desc.dst_desc); + serialize(sstream, desc.diff_src_desc); + serialize(sstream, desc.diff_dst_desc); + // Alpha, beta + sstream.append(desc.alpha); + sstream.append(desc.beta); +} + +void serialize(serialization_stream_t &sstream, const gemm_desc_t &desc) { + // Kind + sstream.append(desc.primitive_kind); + serialize(sstream, desc.a_desc); + serialize(sstream, desc.b_desc); + serialize(sstream, desc.c_desc); + serialize(sstream, desc.bias_desc); + // Accumulator type + sstream.append(desc.acc_type); + sstream.append(desc.sum_ab); + sstream.append(desc.sum_ab_type); +} + +void serialize(serialization_stream_t &sstream, + const group_normalization_desc_t &desc) { + // Kinds + sstream.append(desc.primitive_kind); + sstream.append(desc.prop_kind); + // Memory descriptors + serialize(sstream, desc.src_desc); + serialize(sstream, desc.dst_desc); + serialize(sstream, desc.diff_src_desc); + serialize(sstream, desc.diff_dst_desc); + serialize(sstream, desc.scaleshift_desc); + serialize(sstream, desc.diff_scaleshift_desc); + 
serialize(sstream, desc.stat_desc); + // Groups + sstream.append(desc.groups); + // Epsilon + sstream.append(desc.group_norm_epsilon); + // Flags + sstream.append(desc.flags); +} + +void serialize( + serialization_stream_t &sstream, const inner_product_desc_t &desc) { + // Kinds + sstream.append(desc.primitive_kind); + sstream.append(desc.prop_kind); + // Memory descriptors + serialize(sstream, desc.src_desc); + serialize(sstream, desc.diff_src_desc); + serialize(sstream, desc.weights_desc); + serialize(sstream, desc.diff_weights_desc); + serialize(sstream, desc.bias_desc); + serialize(sstream, desc.diff_bias_desc); + serialize(sstream, desc.dst_desc); + serialize(sstream, desc.diff_dst_desc); + // Accumulator type + sstream.append(desc.accum_data_type); +} + +void serialize(serialization_stream_t &sstream, + const layer_normalization_desc_t &desc) { + // Kinds + sstream.append(desc.primitive_kind); + sstream.append(desc.prop_kind); + // Memory descriptors + serialize(sstream, desc.src_desc); + serialize(sstream, desc.diff_src_desc); + serialize(sstream, desc.data_scaleshift_desc); + serialize(sstream, desc.diff_data_scaleshift_desc); + serialize(sstream, desc.dst_desc); + serialize(sstream, desc.diff_dst_desc); + serialize(sstream, desc.stat_desc); + // Epsilon + sstream.append(desc.layer_norm_epsilon); + // Flags + sstream.append(desc.flags); +} + +void serialize(serialization_stream_t &sstream, const lrn_desc_t &desc) { + // Kinds + sstream.append(desc.primitive_kind); + sstream.append(desc.prop_kind); + sstream.append(desc.alg_kind); + // Memory descriptors + serialize(sstream, desc.src_desc); + serialize(sstream, desc.dst_desc); + serialize(sstream, desc.diff_src_desc); + serialize(sstream, desc.diff_dst_desc); + // Local size + sstream.append(desc.local_size); + // Alpha, beta + sstream.append(desc.lrn_alpha); + sstream.append(desc.lrn_beta); + // k + sstream.append(desc.lrn_k); +} + +void serialize(serialization_stream_t &sstream, const matmul_desc_t &desc) { + // Kinds + sstream.append(desc.primitive_kind); + // Memory descriptors + serialize(sstream, desc.src_desc); + serialize(sstream, desc.weights_desc); + serialize(sstream, desc.bias_desc); + serialize(sstream, desc.dst_desc); + // Accumulator type + sstream.append(desc.accum_data_type); +} + +void serialize(serialization_stream_t &sstream, const pooling_desc_t &desc) { + // Kinds + sstream.append(desc.primitive_kind); + sstream.append(desc.prop_kind); + sstream.append(desc.alg_kind); + // Memory descriptors + serialize(sstream, desc.src_desc); + serialize(sstream, desc.diff_src_desc); + serialize(sstream, desc.dst_desc); + serialize(sstream, desc.diff_dst_desc); + // Strides, dilates, padding + sstream.append_array(DNNL_MAX_NDIMS, desc.strides); + sstream.append_array(DNNL_MAX_NDIMS, desc.kernel); + sstream.append_array(DNNL_MAX_NDIMS, desc.padding[0]); + sstream.append_array(DNNL_MAX_NDIMS, desc.padding[1]); + sstream.append_array(DNNL_MAX_NDIMS, desc.dilation); + // Accumulator type + sstream.append(desc.accum_data_type); +} + +void serialize(serialization_stream_t &sstream, const prelu_desc_t &desc) { + // Kinds + sstream.append(desc.primitive_kind); + sstream.append(desc.prop_kind); + // Memory descriptors + serialize(sstream, desc.src_desc); + serialize(sstream, desc.weights_desc); + serialize(sstream, desc.dst_desc); + serialize(sstream, desc.diff_src_desc); + serialize(sstream, desc.diff_weights_desc); + serialize(sstream, desc.diff_dst_desc); +} + +void serialize(serialization_stream_t &sstream, const reduction_desc_t 
&desc) { + // Kinds + sstream.append(desc.primitive_kind); + sstream.append(desc.alg_kind); + // Memory descriptors + serialize(sstream, desc.src_desc); + serialize(sstream, desc.dst_desc); + // P, eps + sstream.append(desc.p); + sstream.append(desc.eps); +} + +void serialize(serialization_stream_t &sstream, const reorder_desc_t &desc) { + // Kinds + sstream.append(desc.primitive_kind); + // Memory descriptors + serialize(sstream, *desc.src_md); + serialize(sstream, *desc.dst_md); + // Kinds of source and destination engines + sstream.append(desc.src_engine_kind); + sstream.append(desc.dst_engine_kind); + sstream.append(desc.is_cross_engine); +} + +void serialize(serialization_stream_t &sstream, const resampling_desc_t &desc) { + // Kinds + sstream.append(desc.primitive_kind); + sstream.append(desc.alg_kind); + // Memory descriptors + serialize(sstream, desc.src_desc); + serialize(sstream, desc.diff_src_desc); + serialize(sstream, desc.dst_desc); + serialize(sstream, desc.diff_dst_desc); + // Factors + sstream.append_array(DNNL_MAX_NDIMS, desc.factors); +} + +void serialize(serialization_stream_t &sstream, const rnn_desc_t &desc) { + // Kinds + sstream.append(desc.primitive_kind); + sstream.append(desc.prop_kind); + sstream.append(desc.cell_kind); + sstream.append(desc.direction); + // Memory descriptors + serialize(sstream, desc.src_layer_desc); + serialize(sstream, desc.src_iter_desc); + serialize(sstream, desc.src_iter_c_desc); + serialize(sstream, desc.weights_layer_desc); + serialize(sstream, desc.weights_iter_desc); + serialize(sstream, desc.bias_desc); + serialize(sstream, desc.dst_layer_desc); + serialize(sstream, desc.dst_iter_desc); + serialize(sstream, desc.dst_iter_c_desc); + serialize(sstream, desc.weights_peephole_desc); + serialize(sstream, desc.weights_projection_desc); + serialize(sstream, desc.diff_src_layer_desc); + serialize(sstream, desc.diff_src_iter_desc); + serialize(sstream, desc.diff_src_iter_c_desc); + serialize(sstream, desc.diff_weights_layer_desc); + serialize(sstream, desc.diff_weights_iter_desc); + serialize(sstream, desc.diff_bias_desc); + serialize(sstream, desc.diff_dst_layer_desc); + serialize(sstream, desc.diff_dst_iter_desc); + serialize(sstream, desc.diff_dst_iter_c_desc); + serialize(sstream, desc.diff_weights_peephole_desc); + serialize(sstream, desc.diff_weights_projection_desc); + // Flags + sstream.append(desc.flags); + // Activation kind + sstream.append(desc.activation_kind); + // Alpha, beta + sstream.append(desc.alpha); + sstream.append(desc.beta); +} + +// Shuffle +void serialize(serialization_stream_t &sstream, const shuffle_desc_t &desc) { + // Kinds + sstream.append(desc.primitive_kind); + sstream.append(desc.prop_kind); + // Memory descriptors + serialize(sstream, desc.src_desc); + serialize(sstream, desc.dst_desc); + // Axis + sstream.append(desc.axis); + // Group size + sstream.append(desc.group_size); +} + +void serialize(serialization_stream_t &sstream, const softmax_desc_t &desc) { + // Kinds + sstream.append(desc.primitive_kind); + sstream.append(desc.prop_kind); + sstream.append(desc.alg_kind); + // Memory descriptors + serialize(sstream, desc.src_desc); + serialize(sstream, desc.diff_src_desc); + serialize(sstream, desc.dst_desc); + serialize(sstream, desc.diff_dst_desc); + // Axis + sstream.append(desc.softmax_axis); +} + +void serialize(serialization_stream_t &sstream, const sum_desc_t &desc) { + // Kinds + sstream.append(desc.primitive_kind); + // Memory descriptors + serialize(sstream, *desc.dst_md); + // N + 
sstream.append(desc.n); + // Scales + sstream.append_array(desc.n, desc.scales); + // Array of mds + for (int i = 0; i < desc.n; i++) + serialize(sstream, *desc.src_mds[i]); +} + +void serialize(serialization_stream_t &sstream, const sdpa_desc_t &desc) { + // Kind + sstream.append(desc.primitive_kind); + serialize(sstream, desc.q_desc); + serialize(sstream, desc.k_desc); + serialize(sstream, desc.v_desc); + desc.kq_scales.serialize(sstream); + desc.kq_zero_points.serialize(sstream); + desc.vs_scales.serialize(sstream); + desc.vs_zero_points.serialize(sstream); + serialize(sstream, desc.dst_desc); + serialize(sstream, desc.attn_mask_desc); + sstream.append(desc.scale_dt); + sstream.append(desc.invert_scale); + sstream.append(desc.kv_head_number); + sstream.append(desc.mask_type); +} + +} // namespace impl +} // namespace dnnl diff --git a/src/common/primitive_serialization.hpp b/src/common/primitive_serialization.hpp new file mode 100644 index 00000000000..50d87f46be3 --- /dev/null +++ b/src/common/primitive_serialization.hpp @@ -0,0 +1,63 @@ +/******************************************************************************* +* Copyright 2021-2025 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#ifndef COMMON_PRIMITIVE_SERIALIZATION_HPP +#define COMMON_PRIMITIVE_SERIALIZATION_HPP + +#include "common/c_types_map.hpp" +#include "common/primitive_attr.hpp" +#include "common/serialization.hpp" +#include "common/type_helpers.hpp" + +namespace dnnl { +namespace impl { + +void serialize(serialization_stream_t &sstream, const post_ops_t &post_ops); +void serialize(serialization_stream_t &sstream, const primitive_attr_t &attr); +void serialize(serialization_stream_t &sstream, const memory_desc_t &md); +void serialize(serialization_stream_t &sstream, const concat_desc_t &desc); +void serialize(serialization_stream_t &sstream, + const batch_normalization_desc_t &desc); +void serialize(serialization_stream_t &sstream, const binary_desc_t &desc); +void serialize(serialization_stream_t &sstream, const convolution_desc_t &desc); +void serialize(serialization_stream_t &sstream, const eltwise_desc_t &desc); +void serialize(serialization_stream_t &sstream, const gemm_desc_t &desc); +void serialize(serialization_stream_t &sstream, + const group_normalization_desc_t &desc); +void serialize( + serialization_stream_t &sstream, const inner_product_desc_t &desc); +void serialize(serialization_stream_t &sstream, + const layer_normalization_desc_t &desc); +void serialize(serialization_stream_t &sstream, const lrn_desc_t &desc); +void serialize(serialization_stream_t &sstream, const matmul_desc_t &desc); +void serialize(serialization_stream_t &sstream, const pooling_desc_t &desc); +void serialize(serialization_stream_t &sstream, const prelu_desc_t &desc); +void serialize(serialization_stream_t &sstream, const reduction_desc_t &desc); +void serialize(serialization_stream_t &sstream, const reorder_desc_t &desc); +void 
serialize(serialization_stream_t &sstream, const resampling_desc_t &desc); +void serialize(serialization_stream_t &sstream, const rnn_desc_t &desc); +void serialize(serialization_stream_t &sstream, const sdpa_desc_t &desc); +void serialize(serialization_stream_t &sstream, const shuffle_desc_t &desc); +void serialize(serialization_stream_t &sstream, const softmax_desc_t &desc); +void serialize(serialization_stream_t &sstream, const sum_desc_t &desc); + +status_t serialize_desc( + serialization_stream_t &sstream, const op_desc_t *op_desc); + +} // namespace impl +} // namespace dnnl + +#endif diff --git a/src/common/profiler.hpp b/src/common/profiler.hpp index 8bedb0a8e52..35b5b3a90b8 100644 --- a/src/common/profiler.hpp +++ b/src/common/profiler.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2022-2023 Intel Corporation +* Copyright 2022-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -90,8 +90,7 @@ static double get_msec() { // names are copied into long term storage. struct profiler_t { - profiler_t(const std::string &profile_name) - : _profile_name(profile_name), _run_data(), _data() { + profiler_t(const std::string &profile_name) : _profile_name(profile_name) { // Reserve data on construction to reduce chance of recording // reallocation _run_data.reserve(128); @@ -109,14 +108,14 @@ struct profiler_t { // Recording data void stamp(const char *name) { optimization_barrier(); - _run_data.emplace_back(record_t(name, get_msec())); + _run_data.emplace_back(name, get_msec()); assert(_state == RUNNING); optimization_barrier(); } void stop(const char *name) { optimization_barrier(); - _run_data.emplace_back(record_t(name, get_msec())); + _run_data.emplace_back(name, get_msec()); stop(); } @@ -172,7 +171,7 @@ struct profiler_t { T name; prof_time_t time; record_t(T name, prof_time_t time) : name(name), time(time) {} - record_t(std::pair<T, prof_time_t> record) + record_t(const std::pair<T, prof_time_t> &record) : name(record.first), time(record.second) {} // Reversed time ordering bool operator<(const record_t &b) const { return this->time > b.time; } diff --git a/src/common/reduction.cpp b/src/common/reduction.cpp index 6c59114e2c9..1f06a111dfc 100644 --- a/src/common/reduction.cpp +++ b/src/common/reduction.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2020-2023 Intel Corporation +* Copyright 2020-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -124,6 +124,9 @@ status_t reduction_attr_check(const reduction_desc_t &desc, // Check sum VCHECK_RED_UNIMPL(po.check_sum_consistency(dst_dt, false, true), VERBOSE_UNSUPPORTED_POSTOP); + + // Note: verbose support is inside the call. + CHECK(po.validate_binary_with_dst_consistency(&desc.dst_desc)); } return status::success; diff --git a/src/common/reduction_pd.hpp b/src/common/reduction_pd.hpp index e6a5b448609..211b89fd00a 100644 --- a/src/common/reduction_pd.hpp +++ b/src/common/reduction_pd.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2020-2024 Intel Corporation +* Copyright 2020-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License.
@@ -37,10 +37,11 @@ status_t reduction_desc_init(reduction_desc_t *reduction_desc, alg_kind_t alg_kind, const memory_desc_t *src_desc, const memory_desc_t *dst_desc, float p, float eps); +// NOLINTBEGIN(google-default-arguments) struct reduction_pd_t : public primitive_desc_t { static constexpr auto base_pkind = primitive_kind::reduction; - typedef reduction_pd_t hint_class; + using hint_class = reduction_pd_t; const reduction_desc_t *desc() const { return &desc_; } const op_desc_t *op_desc() const override { @@ -131,16 +132,20 @@ struct reduction_pd_t : public primitive_desc_t { } } + bool has_zero_dim_memory() const { + return memory_desc_wrapper(src_md()).has_zero_dim(); + } + protected: reduction_desc_t desc_; memory_desc_t src_md_; memory_desc_t dst_md_; - reduction_pd_t(const reduction_desc_t *adesc, const primitive_attr_t *attr, + reduction_pd_t(const op_desc_t *adesc, const primitive_attr_t *attr, const hint_class *hint_fwd) : primitive_desc_t(attr, base_pkind) - , desc_(*adesc) + , desc_(*op_desc_t::to_desc(adesc)) , src_md_(desc_.src_desc) , dst_md_(desc_.dst_desc) {} @@ -161,6 +166,7 @@ struct reduction_pd_t : public primitive_desc_t { return status::success; } }; +// NOLINTEND(google-default-arguments) } // namespace impl } // namespace dnnl diff --git a/src/common/reorder.cpp b/src/common/reorder.cpp index cedd98c7eb6..c21fe526dfb 100644 --- a/src/common/reorder.cpp +++ b/src/common/reorder.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2016-2023 Intel Corporation +* Copyright 2016-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -38,6 +38,10 @@ namespace impl { VCONDCHECK(primitive, create, check, reorder, (cond), \ status::invalid_arguments, msg, ##__VA_ARGS__); +#define VCHECK_REORDER_UNIMPL(cond, msg, ...) \ + VCONDCHECK(primitive, create, check, reorder, (cond), \ + status::unimplemented, msg, ##__VA_ARGS__); + namespace { engine_t *get_reorder_engine(engine_t *src_engine, engine_t *dst_engine) { auto s_ek = src_engine->kind(); @@ -98,6 +102,48 @@ status_t reorder_primitive_desc_create(std::shared_ptr<primitive_desc_t> &pd, zero_points.has_default_values(DNNL_ARG_DST)), VERBOSE_UNSUPPORTED_ZP_CFG); + // Check scales + if (!attr->scales_.has_default_values()) { + static const std::vector<int> supported_args { + DNNL_ARG_SRC, DNNL_ARG_DST}; + VCHECK_REORDER_UNIMPL(attr->scales_.has_default_values(supported_args), + VERBOSE_UNSUPPORTED_SCALES_CFG); + + const auto &sc = attr->scales_; + const auto &sc_src = sc.get(DNNL_ARG_SRC); + const int mask_src = sc.get_mask(DNNL_ARG_SRC); + + VCHECK_REORDER(IMPLICATION(utils::one_of(src_md->data_type, + data_type::s4, data_type::u4), + mask_src > 0), + VERBOSE_INVALID_DATATYPE, "mask for int4 source"); + + if (!sc_src.has_default_groups()) { + const int src_ndims = s_mdw.ndims(); + const bool group_dims_are_consistent + = IMPLICATION(sc_src.get_group(0) > 1, + src_md->dims[src_ndims - 2] % sc_src.get_group(0) + == 0) + && IMPLICATION(sc_src.get_group(1) > 1, + src_md->dims[src_ndims - 1] % sc_src.get_group(1) + == 0); + VCHECK_REORDER(group_dims_are_consistent, + "groups dimensions are not consistent with reorder " + "dimensions"); + + // Groups are always applied to last two dimensions. Check that + // input scale mask is consistent with this limitation.
+ const bool mask_applies_to_last_two_dims + = (mask_src & (1 << (src_ndims - 1))) + && (mask_src & (1 << (src_ndims - 2))); + VCHECK_REORDER(mask_applies_to_last_two_dims, + "mask is not consistent with groups"); + } + + VCHECK_REORDER(sc.get(DNNL_ARG_DST).has_default_groups(), + VERBOSE_UNSUPPORTED_SCALES_CFG); + } + bool is_cross_engine = src_engine != dst_engine && utils::one_of( engine_kind::gpu, src_engine->kind(), dst_engine->kind()); diff --git a/src/common/reorder.hpp b/src/common/reorder.hpp index c254afba76f..c831d2c79c6 100644 --- a/src/common/reorder.hpp +++ b/src/common/reorder.hpp @@ -29,6 +29,10 @@ status_t reorder_primitive_desc_create(std::shared_ptr<primitive_desc_t> &pd, engine_t *engine, const memory_desc_t *src_md, const memory_desc_t *dst_md, const primitive_attr_t *attr = nullptr); +status_t reorder_primitive_desc_create(std::shared_ptr<primitive_desc_t> &pd, + engine_t *engine, const memory_desc_t *src_md, engine_t *src_engine, + const memory_desc_t *dst_md, engine_t *dst_engine, + const primitive_attr_t *attr = nullptr); } // namespace impl } // namespace dnnl diff --git a/src/common/reorder_pd.hpp b/src/common/reorder_pd.hpp index aea7c6e99de..2eea2c0c246 100644 --- a/src/common/reorder_pd.hpp +++ b/src/common/reorder_pd.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2016-2024 Intel Corporation +* Copyright 2016-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -102,6 +102,7 @@ struct reorder_primitive_desc_iface_t : public dnnl_primitive_desc { dnnl::impl::engine_t *scratchpad_engine_; }; +// NOLINTBEGIN(google-default-arguments) struct reorder_pd_t : public primitive_desc_t { const reorder_desc_t *desc() const { return &desc_; } const op_desc_t *op_desc() const override { @@ -159,10 +160,10 @@ struct reorder_pd_t : public primitive_desc_t { init_desc(src_engine_kind, dst_engine_kind, false); } - reorder_pd_t(const reorder_pd_t &other) : primitive_desc_t(other) { - src_md_ = other.src_md_; - dst_md_ = other.dst_md_; - + reorder_pd_t(const reorder_pd_t &other) + : primitive_desc_t(other) + , src_md_(other.src_md_) + , dst_md_(other.dst_md_) { init_desc(other.desc_.src_engine_kind, other.desc_.dst_engine_kind, other.desc_.is_cross_engine); } @@ -177,7 +178,6 @@ struct reorder_pd_t : public primitive_desc_t { return *this; } -protected: void init_desc(engine_kind_t src_engine_kind, engine_kind_t dst_engine_kind, bool is_cross_engine) { desc_ = reorder_desc_t(); @@ -189,6 +189,7 @@ struct reorder_pd_t : public primitive_desc_t { desc_.is_cross_engine = is_cross_engine; } }; +// NOLINTEND(google-default-arguments) } // namespace impl } // namespace dnnl diff --git a/src/common/resampling.cpp b/src/common/resampling.cpp index 98f91292929..cb5b151629c 100644 --- a/src/common/resampling.cpp +++ b/src/common/resampling.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2023 Intel Corporation +* Copyright 2019-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -131,6 +131,9 @@ status_t resampling_attr_check(const resampling_desc_t &desc, // Check sum VCHECK_RS_UNIMPL(po.check_sum_consistency(dst_dt, false, true), VERBOSE_UNSUPPORTED_POSTOP); + + // Note: verbose support is inside the call.
+ CHECK(po.validate_binary_with_dst_consistency(&desc.dst_desc)); } } else { VCHECK_RS_UNIMPL(false, VERBOSE_UNSUPPORTED_ATTR); diff --git a/src/common/resampling_pd.hpp b/src/common/resampling_pd.hpp index 8946d2297e5..f5d3cab4fff 100644 --- a/src/common/resampling_pd.hpp +++ b/src/common/resampling_pd.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2024 Intel Corporation +* Copyright 2019-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -42,13 +42,6 @@ struct resampling_fwd_pd_t; struct resampling_pd_t : public primitive_desc_t { static constexpr auto base_pkind = primitive_kind::resampling; - resampling_pd_t(const resampling_desc_t *adesc, - const primitive_attr_t *attr, - const resampling_fwd_pd_t *hint_fwd_pd) - : primitive_desc_t(attr, base_pkind) - , desc_(*adesc) - , hint_fwd_pd_(hint_fwd_pd) {} - const resampling_desc_t *desc() const { return &desc_; } const op_desc_t *op_desc() const override { return reinterpret_cast<const op_desc_t *>(this->desc()); @@ -103,6 +96,12 @@ struct resampling_pd_t : public primitive_desc_t { resampling_desc_t desc_; const resampling_fwd_pd_t *hint_fwd_pd_; + resampling_pd_t(const op_desc_t *adesc, const primitive_attr_t *attr, + const resampling_fwd_pd_t *hint_fwd_pd) + : primitive_desc_t(attr, base_pkind) + , desc_(*op_desc_t::to_desc(adesc)) + , hint_fwd_pd_(hint_fwd_pd) {} + private: const memory_desc_t &src_desc() const { return is_fwd() ? desc_.src_desc : desc_.diff_src_desc; } @@ -112,16 +111,10 @@ struct resampling_pd_t : public primitive_desc_t { } }; +// NOLINTBEGIN(google-default-arguments) struct resampling_fwd_pd_t : public resampling_pd_t { - typedef resampling_fwd_pd_t base_class; - typedef resampling_fwd_pd_t hint_class; - - resampling_fwd_pd_t(const resampling_desc_t *adesc, - const primitive_attr_t *attr, - const resampling_fwd_pd_t *hint_fwd_pd) - : resampling_pd_t(adesc, attr, hint_fwd_pd) - , src_md_(desc_.src_desc) - , dst_md_(desc_.dst_desc) {} + using base_class = resampling_fwd_pd_t; + using hint_class = resampling_fwd_pd_t; arg_usage_t arg_usage(int arg) const override { if (arg == DNNL_ARG_SRC) return arg_usage_t::input; @@ -155,6 +148,12 @@ struct resampling_fwd_pd_t : public resampling_pd_t { memory_desc_t src_md_; memory_desc_t dst_md_; + resampling_fwd_pd_t(const op_desc_t *adesc, const primitive_attr_t *attr, + const resampling_fwd_pd_t *hint_fwd_pd) + : resampling_pd_t(adesc, attr, hint_fwd_pd) + , src_md_(desc_.src_desc) + , dst_md_(desc_.dst_desc) {} + virtual status_t set_default_params( format_tag_t src_tag_hint = format_tag::undef) { if (dst_md()->format_kind != format_kind::any) return status::success; @@ -170,17 +169,12 @@ struct resampling_fwd_pd_t : public resampling_pd_t { } } }; +// NOLINTEND(google-default-arguments) +// NOLINTBEGIN(google-default-arguments) struct resampling_bwd_pd_t : public resampling_pd_t { - typedef resampling_bwd_pd_t base_class; - typedef resampling_fwd_pd_t hint_class; - - resampling_bwd_pd_t(const resampling_desc_t *adesc, - const primitive_attr_t *attr, - const resampling_fwd_pd_t *hint_fwd_pd) - : resampling_pd_t(adesc, attr, hint_fwd_pd) - , diff_src_md_(desc_.diff_src_desc) - , diff_dst_md_(desc_.diff_dst_desc) {} + using base_class = resampling_bwd_pd_t; + using hint_class = resampling_fwd_pd_t; arg_usage_t arg_usage(int arg) const override { if (arg == DNNL_ARG_DIFF_DST) return arg_usage_t::input; @@ -216,6 +210,12 
@@ struct resampling_bwd_pd_t : public resampling_pd_t { memory_desc_t diff_src_md_; memory_desc_t diff_dst_md_; + resampling_bwd_pd_t(const op_desc_t *adesc, const primitive_attr_t *attr, + const resampling_fwd_pd_t *hint_fwd_pd) + : resampling_pd_t(adesc, attr, hint_fwd_pd) + , diff_src_md_(desc_.diff_src_desc) + , diff_dst_md_(desc_.diff_dst_desc) {} + virtual status_t set_default_params() { if (diff_dst_md()->format_kind == format_kind::any && hint_fwd_pd_) { status_t status = memory_desc_init_by_md_and_dt(diff_dst_md_, @@ -232,6 +232,7 @@ struct resampling_bwd_pd_t : public resampling_pd_t { diff_src_md_, diff_dst_md_.format_desc.blocking); } }; +// NOLINTEND(google-default-arguments) } // namespace impl } // namespace dnnl diff --git a/src/common/rnn_pd.hpp b/src/common/rnn_pd.hpp index 857ad2e572e..f18e5aaf7de 100644 --- a/src/common/rnn_pd.hpp +++ b/src/common/rnn_pd.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2018-2024 Intel Corporation +* Copyright 2018-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -39,6 +39,7 @@ namespace impl { struct rnn_fwd_pd_t; +// NOLINTBEGIN(google-default-arguments) struct rnn_pd_t : public primitive_desc_t { static constexpr auto base_pkind = primitive_kind::rnn; @@ -230,10 +231,10 @@ struct rnn_pd_t : public primitive_desc_t { memory_desc_t ws_md_; - rnn_pd_t(const rnn_desc_t *adesc, const primitive_attr_t *attr, + rnn_pd_t(const op_desc_t *adesc, const primitive_attr_t *attr, const rnn_fwd_pd_t *hint_fwd_pd) : primitive_desc_t(attr, base_pkind) - , desc_(*adesc) + , desc_(*op_desc_t::to_desc(adesc)) , hint_fwd_pd_(hint_fwd_pd) , src_layer_md_(desc_.src_layer_desc) , src_iter_md_(desc_.src_iter_desc) @@ -245,47 +246,53 @@ struct rnn_pd_t : public primitive_desc_t { , bias_md_(desc_.bias_desc) , dst_layer_md_(desc_.dst_layer_desc) , dst_iter_md_(desc_.dst_iter_desc) - , dst_iter_c_md_(desc_.dst_iter_c_desc) - , ws_md_() {} + , dst_iter_c_md_(desc_.dst_iter_c_desc) {} }; +// NOLINTEND(google-default-arguments) +// NOLINTBEGIN(google-default-arguments) struct rnn_fwd_pd_t : public rnn_pd_t { - typedef rnn_fwd_pd_t base_class; - typedef rnn_fwd_pd_t hint_class; + using base_class = rnn_fwd_pd_t; + using hint_class = rnn_fwd_pd_t; arg_usage_t arg_usage(int arg) const override { if (arg == DNNL_ARG_SRC_LAYER) return arg_usage_t::input; - if (arg == DNNL_ARG_AUGRU_ATTENTION && with_augru_attention()) - return arg_usage_t::input; + if (arg == DNNL_ARG_AUGRU_ATTENTION) + return with_augru_attention() ? arg_usage_t::input + : arg_usage_t::unused; - if (arg == DNNL_ARG_SRC_ITER && with_src_iter()) - return arg_usage_t::input; + if (arg == DNNL_ARG_SRC_ITER) + return with_src_iter() ? arg_usage_t::input : arg_usage_t::unused; - if (arg == DNNL_ARG_SRC_ITER_C && with_src_iter_c()) - return arg_usage_t::input; + if (arg == DNNL_ARG_SRC_ITER_C) + return with_src_iter_c() ? arg_usage_t::input : arg_usage_t::unused; if (utils::one_of(arg, DNNL_ARG_WEIGHTS_LAYER, DNNL_ARG_WEIGHTS_ITER)) return arg_usage_t::input; - if (arg == DNNL_ARG_WEIGHTS_PEEPHOLE && is_lstm_peephole()) - return arg_usage_t::input; + if (arg == DNNL_ARG_WEIGHTS_PEEPHOLE) + return is_lstm_peephole() ? arg_usage_t::input + : arg_usage_t::unused; - if (arg == DNNL_ARG_WEIGHTS_PROJECTION && is_lstm_projection()) - return arg_usage_t::input; + if (arg == DNNL_ARG_WEIGHTS_PROJECTION) + return is_lstm_projection() ? 
arg_usage_t::input + : arg_usage_t::unused; - if (arg == DNNL_ARG_BIAS && with_bias()) return arg_usage_t::input; + if (arg == DNNL_ARG_BIAS) + return with_bias() ? arg_usage_t::input : arg_usage_t::unused; if (arg == DNNL_ARG_DST_LAYER) return arg_usage_t::output; - if (arg == DNNL_ARG_DST_ITER && with_dst_iter()) - return arg_usage_t::output; + if (arg == DNNL_ARG_DST_ITER) + return with_dst_iter() ? arg_usage_t::output : arg_usage_t::unused; - if (arg == DNNL_ARG_DST_ITER_C && with_dst_iter() && is_lstm()) - return arg_usage_t::output; + if (arg == DNNL_ARG_DST_ITER_C) + return with_dst_iter_c() ? arg_usage_t::output + : arg_usage_t::unused; - if (arg == DNNL_ARG_WORKSPACE && is_training()) - return arg_usage_t::output; + if (arg == DNNL_ARG_WORKSPACE) + return is_training() ? arg_usage_t::output : arg_usage_t::unused; return primitive_desc_t::arg_usage(arg); } @@ -323,14 +330,16 @@ struct rnn_fwd_pd_t : public rnn_pd_t { } protected: - rnn_fwd_pd_t(const rnn_desc_t *adesc, const primitive_attr_t *attr, + rnn_fwd_pd_t(const op_desc_t *adesc, const primitive_attr_t *attr, const rnn_fwd_pd_t *hint_fwd_pd) : rnn_pd_t(adesc, attr, hint_fwd_pd) {} }; +// NOLINTEND(google-default-arguments) +// NOLINTBEGIN(google-default-arguments) struct rnn_bwd_pd_t : public rnn_pd_t { - typedef rnn_bwd_pd_t base_class; - typedef rnn_fwd_pd_t hint_class; + using base_class = rnn_bwd_pd_t; + using hint_class = rnn_fwd_pd_t; arg_usage_t arg_usage(int arg) const override { if (utils::one_of(arg, DNNL_ARG_SRC_LAYER, DNNL_ARG_DST_LAYER, @@ -342,53 +351,52 @@ struct rnn_bwd_pd_t : public rnn_pd_t { DNNL_ARG_DIFF_WEIGHTS_LAYER, DNNL_ARG_DIFF_WEIGHTS_ITER)) return arg_usage_t::output; - if (with_augru_attention()) { - if (arg == DNNL_ARG_AUGRU_ATTENTION) return arg_usage_t::input; - if (arg == DNNL_ARG_DIFF_AUGRU_ATTENTION) - return arg_usage_t::output; - } - - if (is_lstm_peephole()) { - if (arg == DNNL_ARG_WEIGHTS_PEEPHOLE) return arg_usage_t::input; - - if (arg == DNNL_ARG_DIFF_WEIGHTS_PEEPHOLE) - return arg_usage_t::output; - } - - if (is_lstm_projection()) { - if (arg == DNNL_ARG_WEIGHTS_PROJECTION) return arg_usage_t::input; - - if (arg == DNNL_ARG_DIFF_WEIGHTS_PROJECTION) - return arg_usage_t::output; - } - - if (with_bias()) { - if (arg == DNNL_ARG_BIAS) return arg_usage_t::input; - - if (arg == DNNL_ARG_DIFF_BIAS) return arg_usage_t::output; - } - - if (with_src_iter()) { - if (arg == DNNL_ARG_SRC_ITER) return arg_usage_t::input; - - if (arg == DNNL_ARG_DIFF_SRC_ITER) return arg_usage_t::output; - } - - if (with_src_iter_c()) { - if (arg == DNNL_ARG_SRC_ITER_C) return arg_usage_t::input; - - if (arg == DNNL_ARG_DIFF_SRC_ITER_C) return arg_usage_t::output; - } - - if (with_dst_iter() - && utils::one_of( - arg, DNNL_ARG_DST_ITER, DNNL_ARG_DIFF_DST_ITER)) - return arg_usage_t::input; - - if (with_dst_iter_c() - && utils::one_of( - arg, DNNL_ARG_DST_ITER_C, DNNL_ARG_DIFF_DST_ITER_C)) - return arg_usage_t::input; + if (arg == DNNL_ARG_AUGRU_ATTENTION) + return with_augru_attention() ? arg_usage_t::input + : arg_usage_t::unused; + if (arg == DNNL_ARG_DIFF_AUGRU_ATTENTION) + return with_augru_attention() ? arg_usage_t::output + : arg_usage_t::unused; + + if (arg == DNNL_ARG_WEIGHTS_PEEPHOLE) + return is_lstm_peephole() ? arg_usage_t::input + : arg_usage_t::unused; + if (arg == DNNL_ARG_DIFF_WEIGHTS_PEEPHOLE) + return is_lstm_peephole() ? arg_usage_t::output + : arg_usage_t::unused; + + if (arg == DNNL_ARG_WEIGHTS_PROJECTION) + return is_lstm_projection() ? 
arg_usage_t::input + : arg_usage_t::unused; + if (arg == DNNL_ARG_DIFF_WEIGHTS_PROJECTION) + return is_lstm_projection() ? arg_usage_t::output + : arg_usage_t::unused; + + if (arg == DNNL_ARG_BIAS) + return with_bias() ? arg_usage_t::input : arg_usage_t::unused; + if (arg == DNNL_ARG_DIFF_BIAS) + return with_bias() ? arg_usage_t::output : arg_usage_t::unused; + + if (arg == DNNL_ARG_SRC_ITER) + return with_src_iter() ? arg_usage_t::input : arg_usage_t::unused; + if (arg == DNNL_ARG_DIFF_SRC_ITER) + return with_src_iter() ? arg_usage_t::output : arg_usage_t::unused; + + if (arg == DNNL_ARG_SRC_ITER_C) + return with_src_iter_c() ? arg_usage_t::input : arg_usage_t::unused; + if (arg == DNNL_ARG_DIFF_SRC_ITER_C) + return with_src_iter_c() ? arg_usage_t::output + : arg_usage_t::unused; + + if (arg == DNNL_ARG_DST_ITER) + return with_dst_iter() ? arg_usage_t::input : arg_usage_t::unused; + if (arg == DNNL_ARG_DIFF_DST_ITER) + return with_dst_iter() ? arg_usage_t::input : arg_usage_t::unused; + + if (arg == DNNL_ARG_DST_ITER_C) + return with_dst_iter_c() ? arg_usage_t::input : arg_usage_t::unused; + if (arg == DNNL_ARG_DIFF_DST_ITER_C) + return with_dst_iter_c() ? arg_usage_t::input : arg_usage_t::unused; if (arg == DNNL_ARG_WORKSPACE) return arg_usage_t::input; @@ -521,7 +529,7 @@ struct rnn_bwd_pd_t : public rnn_pd_t { memory_desc_t diff_dst_iter_md_; memory_desc_t diff_dst_iter_c_md_; - rnn_bwd_pd_t(const rnn_desc_t *adesc, const primitive_attr_t *attr, + rnn_bwd_pd_t(const op_desc_t *adesc, const primitive_attr_t *attr, const rnn_fwd_pd_t *hint_fwd_pd) : rnn_pd_t(adesc, attr, hint_fwd_pd) , diff_src_layer_md_(desc_.diff_src_layer_desc) @@ -536,6 +544,7 @@ struct rnn_bwd_pd_t : public rnn_pd_t { , diff_dst_iter_md_(desc_.diff_dst_iter_desc) , diff_dst_iter_c_md_(desc_.diff_dst_iter_c_desc) {} }; +// NOLINTEND(google-default-arguments) } // namespace impl } // namespace dnnl diff --git a/src/common/scratchpad.hpp b/src/common/scratchpad.hpp index f837b75a28c..133b1ee34e8 100644 --- a/src/common/scratchpad.hpp +++ b/src/common/scratchpad.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2017-2020 Intel Corporation +* Copyright 2017-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -25,7 +25,7 @@ namespace dnnl { namespace impl { struct scratchpad_t { - virtual ~scratchpad_t() {} + virtual ~scratchpad_t() = default; virtual const memory_storage_t *get_memory_storage() const = 0; virtual size_t size() const = 0; }; diff --git a/src/common/sdpa_pd.hpp b/src/common/sdpa_pd.hpp index 9d95612cfd7..39a686abbeb 100644 --- a/src/common/sdpa_pd.hpp +++ b/src/common/sdpa_pd.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2024 Intel Corporation +* Copyright 2024-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -27,11 +27,6 @@ namespace dnnl { namespace impl { -#define DNNL_ARG_QUERIES DNNL_ARG_SRC_0 -#define DNNL_ARG_KEYS DNNL_ARG_SRC_1 -#define DNNL_ARG_VALUES DNNL_ARG_SRC_2 -#define DNNL_ARG_ATTN_MASK DNNL_ARG_SHIFT - #define VDISPATCH_SDPA(cond, msg, ...) 
\ VCONDCHECK(primitive, create, dispatch, sdpa, (cond), \ status::unimplemented, "%s," msg, this->info(engine), \ @@ -41,11 +36,12 @@ namespace impl { VCHECK(primitive, create, dispatch, sdpa, (f), "%s," msg, \ this->info(engine), ##__VA_ARGS__) +// NOLINTBEGIN(google-default-arguments) struct sdpa_pd_t : public primitive_desc_t { static constexpr auto base_pkind = primitive_kind::sdpa; - typedef sdpa_pd_t base_class; - typedef sdpa_pd_t hint_class; + using base_class = sdpa_pd_t; + using hint_class = sdpa_pd_t; const sdpa_desc_t *desc() const { return &desc_; } const op_desc_t *op_desc() const override { @@ -53,8 +49,15 @@ struct sdpa_pd_t : public primitive_desc_t { } arg_usage_t arg_usage(int arg) const override { + // TODO: this is broken for cases when the user passes quantization + // memories unconditionally but the primitive desc is not set up for + // quantization. if (utils::one_of(arg, DNNL_ARG_QUERIES, DNNL_ARG_KEYS, DNNL_ARG_VALUES, - DNNL_ARG_ATTN_MASK, DNNL_ARG_SCALE)) + DNNL_ARG_ATTN_MASK, DNNL_ARG_SCALE, + DNNL_ARG_ATTR_SCALES | DNNL_ARG_KEYS, + DNNL_ARG_ATTR_SCALES | DNNL_ARG_VALUES, + DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_KEYS, + DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_VALUES)) return arg_usage_t::input; if (arg == DNNL_ARG_DST) return arg_usage_t::output; @@ -94,7 +97,9 @@ struct sdpa_pd_t : public primitive_desc_t { const memory_desc_t *val_md() const { return &desc_.v_desc; } const memory_desc_t *attn_mask_md() const { return &desc_.attn_mask_desc; } - int n_inputs() const override { return 3 + int(with_attn_mask()); } + int n_inputs() const override { + return 3 + int(with_attn_mask()) + int(with_attn_scale()); + } int n_outputs() const override { return 1; } bool with_attn_scale() const { @@ -105,12 +110,81 @@ struct sdpa_pd_t : public primitive_desc_t { return (attn_mask_md()->data_type != data_type::undef); } + /// If true, the attention mask is a causal mask + bool with_causal_mask() const { + return desc_.mask_type == attn_mask_type::top_left + || desc_.mask_type == attn_mask_type::bottom_right; + } + + /// If true, dequantize the K tensor using scaling in the KQ matmul + bool with_key_scales() const { + return (!desc()->kq_scales.has_default_values()); + } + + /// If true, dequantize the V tensor using scaling in the VS matmul + bool with_value_scales() const { + return (!desc()->vs_scales.has_default_values()); + } + + /// If true, dequantize the K tensor with zero points in the KQ matmul + bool with_key_zp() const { + return (!desc()->kq_zero_points.has_default_values(DNNL_ARG_WEIGHTS)); + } + + /// If true, dequantize the V tensor with zero points in the VS matmul + bool with_value_zp() const { + return (!desc()->vs_zero_points.has_default_values(DNNL_ARG_WEIGHTS)); + } + + /// Returns the data type of the scales tensor for the KQ matmul + data_type_t key_scales_dt() const { + return desc()->kq_scales.get_data_type(); + } + + /// Returns the data type of the zero points tensor for the KQ matmul + data_type_t key_zp_dt() const { + return desc()->kq_zero_points.get_data_type(DNNL_ARG_WEIGHTS); + } + + /// Returns the data type of the scales tensor for the VS matmul + data_type_t value_scales_dt() const { + return desc()->vs_scales.get_data_type(); + } + + /// Returns the data type of the zero points tensor for the VS matmul + data_type_t value_zp_dt() const { + return desc()->vs_zero_points.get_data_type(DNNL_ARG_WEIGHTS); + } + + // Returns the group size for the quantization parameters for the KQ matmul + int key_group_size() const { + int out = 0; + if 
(with_key_scales()) + out = scale_group_size(desc()->kq_scales, *key_md()); + else if (with_key_zp()) { + out = zp_group_size(desc()->kq_zero_points, *key_md()); + } + return out; + } + + // Returns the group size for the quantization parameters for the VS matmul + int value_group_size() const { + int out = 0; + if (with_value_scales()) + out = scale_group_size(desc()->vs_scales, *val_md()); + else if (with_value_zp()) { + out = zp_group_size(desc()->vs_zero_points, *val_md()); + } + return out; + } + protected: sdpa_desc_t desc_; - sdpa_pd_t(const sdpa_desc_t *adesc, const primitive_attr_t *attr, + sdpa_pd_t(const op_desc_t *adesc, const primitive_attr_t *attr, const hint_class *hint_fwd_pd) - : primitive_desc_t(attr, base_pkind), desc_(*adesc) {} + : primitive_desc_t(attr, base_pkind) + , desc_(*op_desc_t::to_desc(adesc)) {} bool set_default_format(memory_desc_t *md) { memory_desc_wrapper mdw(md); @@ -132,7 +206,49 @@ struct sdpa_pd_t : public primitive_desc_t { return ok; } + +private: + static int scale_group_size( + const quant_entry_t &scales, const memory_desc_t &desc) { + dim_t out = utils::array_product(desc.dims, desc.ndims); + const auto mask = scales.get_mask(); + if (scales.has_default_groups()) { + for (int idx : mask_iterator(mask)) { + out /= desc.dims[idx]; + } + } else { + for (int idx : mask_iterator(mask)) { + if (idx < 2) { + out /= desc.dims[idx]; + } else { + out /= (desc.dims[idx] / scales.get_group(idx - 2)); + } + } + } + return static_cast<int>(out); + } + + static int zp_group_size( + const zero_points_t &zp, const memory_desc_t &desc) { + dim_t out = utils::array_product(desc.dims, desc.ndims); + if (zp.get(DNNL_ARG_WEIGHTS).has_default_groups()) { + for (int idx : mask_iterator(zp.get_mask(DNNL_ARG_WEIGHTS))) { + out /= desc.dims[idx]; + } + } else { + for (int idx : mask_iterator(zp.get_mask(DNNL_ARG_WEIGHTS))) { + if (idx < 2) { + out /= desc.dims[idx]; + } else { + out /= (desc.dims[idx] + / zp.get_group(DNNL_ARG_WEIGHTS, idx - 2)); + } + } + } + return static_cast<int>(out); + } }; +// NOLINTEND(google-default-arguments) } // namespace impl } // namespace dnnl diff --git a/src/common/sdpa_test_iface.cpp b/src/common/sdpa_test_iface.cpp new file mode 100644 index 00000000000..a7834210f51 --- /dev/null +++ b/src/common/sdpa_test_iface.cpp @@ -0,0 +1,46 @@ +/******************************************************************************* +* Copyright 2025 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License.
+*******************************************************************************/ + +#include "common/c_types_map.hpp" +#include "common/primitive_desc_iface.hpp" +#include "common/sdpa_pd.hpp" +#include "common/sdpa_types.hpp" +#include "common/sdpa_utils.hpp" +#include "opdesc.hpp" + +using dnnl::impl::status_t; +using namespace dnnl::impl; + +dnnl_status_t DNNL_API sdpa_primitive_desc_create( + dnnl_primitive_desc_t *primitive_desc_iface, dnnl_engine_t engine, + const_dnnl_memory_desc_t query_desc, const_dnnl_memory_desc_t key_desc, + const_dnnl_memory_desc_t value_desc, const_dnnl_memory_desc_t dst_desc, + const_dnnl_memory_desc_t mask_desc, dnnl_data_type_t scale_dt, + bool invert_scale, dnnl_dim_t kv_head_number, int attn_mask_type, + const_dnnl_primitive_attr_t attr, const_dnnl_primitive_attr_t kq_attr, + const_dnnl_primitive_attr_t vs_attr) { + CHECK(sdpa_desc_check(query_desc, key_desc, value_desc, dst_desc, mask_desc, + engine, attr, kq_attr, vs_attr)); + CHECK(sdpa_attr_check( + query_desc, key_desc, value_desc, engine, attr, kq_attr, vs_attr)); + + dnnl::impl::sdpa_desc_t sdpa_desc = dnnl::impl::create_sdpa_desc(query_desc, + key_desc, value_desc, dst_desc, mask_desc, + (dnnl::impl::data_type_t)scale_dt, invert_scale, kv_head_number, + static_cast<dnnl::impl::attn_mask_type_t>(attn_mask_type), kq_attr, vs_attr); + return dnnl::impl::primitive_desc_create(primitive_desc_iface, engine, + (const dnnl::impl::op_desc_t *)&sdpa_desc, nullptr, attr); +} diff --git a/src/common/sdpa_types.hpp b/src/common/sdpa_types.hpp index 03fc9f67aaa..8b203a9e956 100644 --- a/src/common/sdpa_types.hpp +++ b/src/common/sdpa_types.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2024 Intel Corporation +* Copyright 2024-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,28 +17,75 @@ #ifndef COMMON_SDPA_TYPES_HPP #define COMMON_SDPA_TYPES_HPP -#include +#include "oneapi/dnnl/dnnl_types.h" + #include "common/c_types_map.hpp" #include "common/memory_desc.hpp" +#include "common/primitive_attr_quant.hpp" + +#include <memory> + namespace dnnl { namespace impl { +#define DNNL_ARG_QUERIES DNNL_ARG_SRC_0 +#define DNNL_ARG_KEYS DNNL_ARG_SRC_1 +#define DNNL_ARG_VALUES DNNL_ARG_SRC_2 +#define DNNL_ARG_ATTN_MASK DNNL_ARG_SHIFT + +// NOLINTBEGIN(modernize-use-using) +/// Types of attention mask +typedef enum { + dnnl_attn_mask_undef = 0, + /// explicit attention masks defined in a buffer + dnnl_attn_mask_buffer = 1, + + /// causal mask with the diagonal starting from the top left hand side of + /// the mask tensor + dnnl_attn_mask_top_left = 2, + + /// causal mask with the diagonal starting from the bottom right hand side + /// of the mask tensor + dnnl_attn_mask_bottom_right = 3, +} dnnl_attn_mask_type_t; +// NOLINTEND(modernize-use-using) + +using attn_mask_type_t = dnnl_attn_mask_type_t; +namespace attn_mask_type { +const attn_mask_type_t undef = dnnl_attn_mask_undef; +const attn_mask_type_t buffer = dnnl_attn_mask_buffer; +const attn_mask_type_t top_left = dnnl_attn_mask_top_left; +const attn_mask_type_t bottom_right = dnnl_attn_mask_bottom_right; +} // namespace attn_mask_type + // A descriptor for a scaled dot product attention (SDPA) operation. -struct sdpa_desc_t { - // The kind of primitive. Used for self identifying the primitive - // descriptor. Must be sdpa. 
- dnnl_primitive_kind_t primitive_kind; +struct sdpa_desc_t : public op_desc_t { + sdpa_desc_t() : op_desc_t(primitive_kind::sdpa) {} + + std::unique_ptr<op_desc_t> clone() const override { + return utils::make_unique<sdpa_desc_t>(*this); + } + memory_desc_t q_desc; /* queries */ memory_desc_t k_desc; /* keys */ memory_desc_t v_desc; /* values */ + + // primitive_attr_t can't be used because of deleted copy-ctor, but desc_t + // must be copyable. + quant_entry_t kq_scales; + zero_points_t kq_zero_points; + quant_entry_t vs_scales; + zero_points_t vs_zero_points; + memory_desc_t dst_desc; memory_desc_t attn_mask_desc; - data_type_t scale_dt; + data_type_t scale_dt {}; // invert_scale = false: multiply by scale // invert_scale = true: divide by scale - bool invert_scale; - dim_t kv_head_number; + bool invert_scale {}; + dim_t kv_head_number {}; + + attn_mask_type_t mask_type = attn_mask_type::undef; // Number of queries. dnnl_dim_t queries() const { return q_desc.dims[q_desc.ndims - 2]; } diff --git a/src/common/sdpa_utils.hpp b/src/common/sdpa_utils.hpp index ccba17a2081..c72d6f0a0f4 100644 --- a/src/common/sdpa_utils.hpp +++ b/src/common/sdpa_utils.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2024 Intel Corporation +* Copyright 2024-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -28,20 +28,135 @@ namespace dnnl { namespace impl { +#define VCHECK_SDPA(f, msg, ...) \ + VCHECK(primitive, create, check, sdpa, (f), msg, ##__VA_ARGS__); + +#define VCHECK_SDPA_COND(cond, msg, ...) \ + VCONDCHECK(primitive, create, check, sdpa, (cond), \ + status::invalid_arguments, msg, ##__VA_ARGS__); + +#define VCHECK_SDPA_ATTR_TYPE( \ + variable_check, variable, attribute_member_name, expected_types) \ + VCONDCHECK(primitive, create, check, sdpa, (variable_check), \ + status::invalid_arguments, VERBOSE_INVALID_DATATYPE, \ + format_verbose_string(#variable attribute_member_name \ + "(%s). must be " expected_types, \ + attr2str(variable).c_str()) \ + .c_str()) + +#define VCHECK_SDPA_UNIMPL(cond, msg, ...) \ + VCONDCHECK(primitive, create, check, sdpa, (cond), status::unimplemented, \ + msg, ##__VA_ARGS__); + +static inline status_t sdpa_desc_check(const memory_desc_t *q_desc, + const memory_desc_t *k_desc, const memory_desc_t *v_desc, + const memory_desc_t *dst_desc, const memory_desc_t *attn_mask_md, + const engine_t *engine, const primitive_attr_t *attr, + const primitive_attr_t *kq_attr, const primitive_attr_t *vs_attr) { + int ndims = dst_desc->ndims; + int r = ndims - 2, c = ndims - 1; + VCHECK_SDPA_COND(utils::everyone_is(ndims, q_desc->ndims, k_desc->ndims, + v_desc->ndims), + "number of dimensions have to match. 
expected: %d q: %d k: %d v: " + "%d", + ndims, q_desc->ndims, k_desc->ndims, v_desc->ndims); + + VCHECK_SDPA_COND(q_desc->dims[c] == k_desc->dims[r], + "q_desc->dims[%d](%s) must match k_desc->dims[%d](%s)", c, + md2dim_str(q_desc).c_str(), r, md2dim_str(k_desc).c_str()); + VCHECK_SDPA_COND(k_desc->dims[c] == v_desc->dims[r], + "k_desc->dims[%d](%s) must match v_desc->dims[%d](%s)", c, + md2dim_str(k_desc).c_str(), r, md2dim_str(v_desc).c_str()); + VCHECK_SDPA_COND(dst_desc->dims[r] == q_desc->dims[r], + "dst_desc->dims[%d](%s) == q_desc->dims[%d](%s)", r, + md2dim_str(dst_desc).c_str(), r, md2dim_str(q_desc).c_str()); + VCHECK_SDPA_COND(dst_desc->dims[c] == v_desc->dims[c], + "dst_desc->dims[%d](%s) == v_desc->dims[%d](%s)", c, + md2dim_str(dst_desc).c_str(), c, md2dim_str(v_desc).c_str()); + + return status::success; +} + +static inline status_t sdpa_attr_check(const memory_desc_t *q_desc, + const memory_desc_t *k_desc, const memory_desc_t *v_desc, + const engine_t *engine, const primitive_attr_t *attr, + const primitive_attr_t *kq_attr, const primitive_attr_t *vs_attr) { + using smask_t = primitive_attr_t::skip_mask_t; + + if (utils::everyone_is(nullptr, attr, kq_attr, vs_attr)) + return status::success; + if (attr && attr->has_default_values() && kq_attr + && kq_attr->has_default_values() && vs_attr + && vs_attr->has_default_values()) { + return status::success; + } + + using namespace dnnl::impl::data_type; + if (kq_attr && !kq_attr->has_default_values()) { + const auto &sc = kq_attr->scales_; + const auto &zp = kq_attr->zero_points_; + if (!sc.has_default_values()) { + const auto &scale_dt = sc.get_data_type(DNNL_ARG_WEIGHTS); + VCHECK_SDPA_ATTR_TYPE(utils::one_of(scale_dt, f16, f32), kq_attr, + "scales", "f16 or f32"); + } + if (!zp.has_default_values()) { + const auto &zp_dt = zp.get_data_type(DNNL_ARG_WEIGHTS); + VCHECK_SDPA_ATTR_TYPE(utils::one_of(zp_dt, s4, u4, u8, s8, s32), + kq_attr, "zero_points", "u4, s4, u8, s8, or s32"); + } + } + + if (vs_attr && !vs_attr->has_default_values()) { + const auto &sc = vs_attr->scales_; + const auto &zp = vs_attr->zero_points_; + + if (!sc.has_default_values()) { + const auto &scale_dt = sc.get_data_type(DNNL_ARG_WEIGHTS); + VCHECK_SDPA_ATTR_TYPE(utils::one_of(scale_dt, f16, f32), vs_attr, + "scales", "f16 or f32"); + } + if (!zp.has_default_values()) { + const auto &zp_dt = zp.get_data_type(DNNL_ARG_WEIGHTS); + VCHECK_SDPA_ATTR_TYPE(utils::one_of(zp_dt, s4, u4, u8, s8, s32), + vs_attr, "zero_points", "u4, s4, u8, s8, or s32"); + } + } + + if (attr) { + smask_t attr_mask = smask_t::none; + VCHECK_SDPA_UNIMPL( + attr->has_default_values(attr_mask), VERBOSE_UNSUPPORTED_ATTR); + } + + return status::success; +} + static inline sdpa_desc_t create_sdpa_desc(const memory_desc_t *q_md, const memory_desc_t *k_md, const memory_desc_t *v_md, const memory_desc_t *dst_md, const memory_desc_t *attn_mask_md, - data_type_t scale_dt, dim_t kv_head_number, bool invert_scale = false) { + data_type_t scale_dt, bool invert_scale, dim_t kv_head_number, + attn_mask_type_t attn_mask_type, const primitive_attr_t *kq_attr, + const primitive_attr_t *vs_attr) { auto sdpa_desc = sdpa_desc_t(); sdpa_desc.primitive_kind = primitive_kind::sdpa; sdpa_desc.q_desc = *q_md; sdpa_desc.k_desc = *k_md; + if (kq_attr) { + sdpa_desc.kq_scales = kq_attr->scales_.get(DNNL_ARG_WEIGHTS); + sdpa_desc.kq_zero_points = kq_attr->zero_points_; + } + if (vs_attr) { + sdpa_desc.vs_scales = vs_attr->scales_.get(DNNL_ARG_WEIGHTS); + sdpa_desc.vs_zero_points = vs_attr->zero_points_; + } 
sdpa_desc.v_desc = *v_md; sdpa_desc.dst_desc = *dst_md; if (attn_mask_md) sdpa_desc.attn_mask_desc = *attn_mask_md; sdpa_desc.scale_dt = scale_dt; sdpa_desc.invert_scale = invert_scale; sdpa_desc.kv_head_number = kv_head_number; + sdpa_desc.mask_type = attn_mask_type; return sdpa_desc; } @@ -50,26 +165,25 @@ static inline status_t create_sdpa_pd( const memory_desc_t *q_md, const memory_desc_t *k_md, const memory_desc_t *v_md, const memory_desc_t *dst_md, const memory_desc_t *attn_mask_md, data_type_t scale_dt, - bool invert_scale, const primitive_attr_t *attr, dim_t kv_head_number) { - auto sdpa_desc = create_sdpa_desc(q_md, k_md, v_md, dst_md, attn_mask_md, - scale_dt, kv_head_number, invert_scale); + bool invert_scale, dim_t kv_head_number, + attn_mask_type_t attn_mask_type, const primitive_attr_t *attr, + const primitive_attr_t *kq_attr = nullptr, + const primitive_attr_t *vs_attr = nullptr) { + CHECK(sdpa_attr_check(q_md, k_md, v_md, engine, attr, kq_attr, vs_attr)); + CHECK(sdpa_desc_check(q_md, k_md, v_md, dst_md, attn_mask_md, engine, attr, + kq_attr, vs_attr)); - int ndims = dst_md->ndims; - int r = ndims - 2, c = ndims - 1; - if (!utils::everyone_is(ndims, q_md->ndims, k_md->ndims, v_md->ndims)) - return status::invalid_arguments; - if (q_md->dims[c] != k_md->dims[r]) return status::invalid_arguments; - if (k_md->dims[c] != v_md->dims[r]) return status::invalid_arguments; - if (dst_md->dims[r] != q_md->dims[r] || dst_md->dims[c] != v_md->dims[c]) - return status::invalid_arguments; + auto sdpa_desc = create_sdpa_desc(q_md, k_md, v_md, dst_md, attn_mask_md, + scale_dt, invert_scale, kv_head_number, attn_mask_type, kq_attr, + vs_attr); - primitive_attr_t sdpa_attr = *attr; + primitive_attr_t sdpa_attr = attr ? *attr : default_attr(); primitive_desc_iterator_t it( engine, (op_desc_t *)&sdpa_desc, &sdpa_attr, nullptr); sdpa_pd_ = *(++it); - if (!sdpa_pd_) return status::unimplemented; + VCHECK_SDPA_COND(sdpa_pd_, "failed to create the SDPA primitive"); return status::success; } diff --git a/src/common/serialization.cpp b/src/common/serialization.cpp deleted file mode 100644 index 035733db406..00000000000 --- a/src/common/serialization.cpp +++ /dev/null @@ -1,618 +0,0 @@ -/******************************************************************************* -* Copyright 2021-2024 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
-*******************************************************************************/ - -#include "common/serialization.hpp" -#include "common/type_helpers.hpp" -#include "common/utils.hpp" - -namespace dnnl { -namespace impl { -namespace serialization { - -status_t serialize_desc( - serialization_stream_t &sstream, const op_desc_t *op_desc) { -#define CASE(pkind) \ - case primitive_kind::pkind: \ - serialize_desc(sstream, *(const pkind##_desc_t *)op_desc); \ - break; - - switch ((int)op_desc->kind) { - CASE(batch_normalization) - CASE(binary) - CASE(concat) - CASE(convolution) - CASE(deconvolution) - CASE(eltwise) - CASE(gemm) - CASE(group_normalization) - CASE(inner_product) - CASE(layer_normalization) - CASE(lrn) - CASE(matmul) - CASE(pooling) - CASE(prelu) - CASE(reduction) - CASE(reorder) - CASE(resampling) - CASE(rnn) - CASE(sdpa) - CASE(shuffle) - CASE(softmax) - CASE(sum) - default: return status::invalid_arguments; - } -#undef CASE - return status::success; -} - -void serialize_md(serialization_stream_t &sstream, const memory_desc_t &md) { - sstream.write(&md.ndims); - sstream.write(md.dims, md.ndims); - sstream.write(&md.data_type); - sstream.write(md.padded_dims, md.ndims); - sstream.write(md.padded_offsets, md.ndims); - sstream.write(&md.offset0); - sstream.write(&md.format_kind); - // format desc - switch ((int)md.format_kind) { - case format_kind::undef: - case format_kind::any: break; - case format_kind::blocked: - sstream.write(md.format_desc.blocking.strides, md.ndims); - sstream.write(&md.format_desc.blocking.inner_nblks); - sstream.write(md.format_desc.blocking.inner_blks, - md.format_desc.blocking.inner_nblks); - sstream.write(md.format_desc.blocking.inner_idxs, - md.format_desc.blocking.inner_nblks); - break; - case format_kind::wino: - sstream.write(&md.format_desc.wino_desc.wino_format); - sstream.write(&md.format_desc.wino_desc.r); - sstream.write(&md.format_desc.wino_desc.alpha); - sstream.write(&md.format_desc.wino_desc.ic); - sstream.write(&md.format_desc.wino_desc.oc); - sstream.write(&md.format_desc.wino_desc.ic_block); - sstream.write(&md.format_desc.wino_desc.oc_block); - sstream.write(&md.format_desc.wino_desc.ic2_block); - sstream.write(&md.format_desc.wino_desc.oc2_block); - sstream.write(&md.format_desc.wino_desc.adj_scale); - sstream.write(&md.format_desc.wino_desc.size); - break; - case format_kind::rnn_packed: - sstream.write(&md.format_desc.rnn_packed_desc.format); - sstream.write(&md.format_desc.rnn_packed_desc.n_parts); - sstream.write(&md.format_desc.rnn_packed_desc.n); - sstream.write(&md.format_desc.rnn_packed_desc.ldb); - { - int n_parts = md.format_desc.rnn_packed_desc.n_parts; - sstream.write(md.format_desc.rnn_packed_desc.parts, n_parts); - sstream.write( - md.format_desc.rnn_packed_desc.part_pack_size, n_parts); - sstream.write( - md.format_desc.rnn_packed_desc.pack_part, n_parts); - } - sstream.write(&md.format_desc.rnn_packed_desc.offset_compensation); - sstream.write(&md.format_desc.rnn_packed_desc.size); - break; - default: assert(!"unknown format_kind"); - } - - if (md.extra.flags != dnnl_memory_extra_flag_none) { - sstream.write(&md.extra.flags); - if ((md.extra.flags - & (dnnl_memory_extra_flag_compensation_conv_s8s8 - | dnnl_memory_extra_flag_rnn_u8s8_compensation)) - && !types::extra_flag_rnn_s8s8_compensation_is_set( - md.extra.flags)) { - sstream.write(&md.extra.compensation_mask); - } - - if (md.extra.flags & dnnl_memory_extra_flag_scale_adjust) { - sstream.write(&md.extra.scale_adjust); - } - - if (md.extra.flags - & 
dnnl_memory_extra_flag_compensation_conv_asymmetric_src) { - sstream.write(&md.extra.asymm_compensation_mask); - } - } -} - -void serialize_post_ops( - serialization_stream_t &sstream, const post_ops_t &post_ops) { - // post_ops: entry[:] - for (int i = 0; i < post_ops.len(); i++) { - const auto &entry = post_ops.entry_[i]; - switch (entry.kind) { - case primitive_kind::eltwise: - sstream.write(&entry.eltwise.alg); - sstream.write(&entry.eltwise.scale); - sstream.write(&entry.eltwise.alpha); - sstream.write(&entry.eltwise.beta); - break; - case primitive_kind::sum: - sstream.write(&entry.sum.scale); - sstream.write(&entry.sum.zero_point); - sstream.write(&entry.sum.dt); - break; - case primitive_kind::convolution: - sstream.write(&entry.depthwise_conv.kernel); - sstream.write(&entry.depthwise_conv.stride); - sstream.write(&entry.depthwise_conv.padding); - sstream.write(&entry.depthwise_conv.wei_dt); - sstream.write(&entry.depthwise_conv.bias_dt); - sstream.write(&entry.depthwise_conv.dst_dt); - break; - case primitive_kind::binary: - sstream.write(&entry.binary.alg); - serialize_md(sstream, entry.binary.user_src1_desc); - break; - case primitive_kind::prelu: sstream.write(&entry.prelu.mask); break; - default: assert(!"unknown post_op"); - } - } -} - -void serialize_attr( - serialization_stream_t &sstream, const primitive_attr_t &attr) { - // scratchpad_mode - sstream.write(&attr.scratchpad_mode_); - // fpmath_mode - sstream.write(&attr.fpmath_.mode_); - sstream.write(&attr.fpmath_.apply_to_int_); - // deterministic - sstream.write(&attr.deterministic_); - // acc_mode - sstream.write(&attr.acc_mode_); - - if (!attr.output_scales_.has_default_values()) { - // output_scales: mask - sstream.write(&attr.output_scales_.mask_); - } else if (!attr.scales_.has_default_values()) { - sstream.write("scale:"); - // go through scales for all arguments - for (const auto &p : attr.scales_.scales_) { - // scales: arg - sstream.write(&p.first); - // scales: mask - sstream.write(&p.second.mask_); - // scales: groups - const int ndims = p.second.ndims_; - sstream.write(&ndims); - if (ndims > 0) sstream.write(p.second.group_dims_, ndims); - // scales: data type - sstream.write(&p.second.data_type_); - } - } - // zero_points - if (!attr.zero_points_.has_default_values()) sstream.write("zp:"); - for (int arg : {DNNL_ARG_SRC, DNNL_ARG_WEIGHTS, DNNL_ARG_DST}) - if (!attr.zero_points_.has_default_values(arg)) { - const auto &zps = attr.zero_points_; - // zero_points: arg - sstream.write(&arg); - int mask = 0; - data_type_t dt = data_type::s32; - zps.get(arg, &mask, &dt); - // zero_points: mask - sstream.write(&mask); - // zero points: groups - const int ndims = zps.get_groups_ndims(arg); - sstream.write(&ndims); - if (ndims > 0) sstream.write(zps.get_groups(arg), ndims); - // zero_points: data type - sstream.write(&dt); - } - - // Rounding modes - if (!attr.rounding_mode_.has_default_values()) sstream.write("rm:"); - for (const auto &e : attr.rounding_mode_.rounding_modes_map_) { - if (!attr.rounding_mode_.has_default_values(e.first)) { - sstream.write(&e.first); - sstream.write(&e.second); - } - } - - if (!attr.dropout_.has_default_values()) { - sstream.write("dropout:"); - serialize_md(sstream, attr.dropout_.user_dropout_desc_); - } - - serialize_post_ops(sstream, attr.post_ops_); - - // rnn_data_qparams: scale, shift - sstream.write(&attr.rnn_data_qparams_.scale_); - sstream.write(&attr.rnn_data_qparams_.shift_); - if (!attr.rnn_weights_qparams_.has_default_values()) { - // rnn_weights_qparams: mask - 
sstream.write(&attr.rnn_weights_qparams_.mask_); - // rnn_weights_qparams: count - sstream.write(&attr.rnn_weights_qparams_.count_); - // rnn_weights_qparams: scales[:] - sstream.write(attr.rnn_weights_qparams_.scales_, - attr.rnn_weights_qparams_.count_); - } - if (attr.gpu_attr_) { - attr.gpu_attr_->serialize(sstream); - } else { - int zero = 0; - sstream.write(&zero); - } -} - -void serialize_desc( - serialization_stream_t &sstream, const concat_desc_t &desc) { - // Kinds - sstream.write(&desc.primitive_kind); - // Memory descriptors - serialize_md(sstream, *desc.dst_md); - // N - sstream.write(&desc.n); - // Concat dimension - sstream.write(&desc.concat_dimension); - // Array of mds - for (int i = 0; i < desc.n; i++) - serialize_md(sstream, *desc.src_mds[i]); -} - -void serialize_desc(serialization_stream_t &sstream, - const batch_normalization_desc_t &desc) { - // Kinds - sstream.write(&desc.primitive_kind); - sstream.write(&desc.prop_kind); - // Memory descriptors - serialize_md(sstream, desc.src_desc); - serialize_md(sstream, desc.dst_desc); - serialize_md(sstream, desc.diff_src_desc); - serialize_md(sstream, desc.diff_dst_desc); - serialize_md(sstream, desc.scaleshift_desc); - serialize_md(sstream, desc.diff_scaleshift_desc); - serialize_md(sstream, desc.stat_desc); - // Epsilon - sstream.write(&desc.batch_norm_epsilon); - // Flags - sstream.write(&desc.flags); -} - -void serialize_desc( - serialization_stream_t &sstream, const binary_desc_t &desc) { - // Kinds - sstream.write(&desc.primitive_kind); - sstream.write(&desc.alg_kind); - // Memory descriptors - serialize_md(sstream, desc.src_desc[0]); - serialize_md(sstream, desc.src_desc[1]); - serialize_md(sstream, desc.dst_desc); -} - -// (De-)Convolution -void serialize_desc( - serialization_stream_t &sstream, const convolution_desc_t &desc) { - // Kinds - sstream.write(&desc.primitive_kind); - sstream.write(&desc.prop_kind); - sstream.write(&desc.alg_kind); - // Memory descriptors - serialize_md(sstream, desc.src_desc); - serialize_md(sstream, desc.diff_src_desc); - serialize_md(sstream, desc.weights_desc); - serialize_md(sstream, desc.diff_weights_desc); - serialize_md(sstream, desc.bias_desc); - serialize_md(sstream, desc.diff_bias_desc); - serialize_md(sstream, desc.dst_desc); - serialize_md(sstream, desc.diff_dst_desc); - // Strides, dilates, padding - sstream.write(desc.strides, DNNL_MAX_NDIMS); - sstream.write(desc.dilates, DNNL_MAX_NDIMS); - sstream.write(desc.padding[0], DNNL_MAX_NDIMS); - sstream.write(desc.padding[1], DNNL_MAX_NDIMS); - // Accumulator type - sstream.write(&desc.accum_data_type); - // Internal member - sstream.write(&desc.use_inversion); -} - -// Eltwise -void serialize_desc( - serialization_stream_t &sstream, const eltwise_desc_t &desc) { - // Kinds - sstream.write(&desc.primitive_kind); - sstream.write(&desc.prop_kind); - sstream.write(&desc.alg_kind); - // Memory descriptors - serialize_md(sstream, desc.src_desc); - serialize_md(sstream, desc.dst_desc); - serialize_md(sstream, desc.diff_src_desc); - serialize_md(sstream, desc.diff_dst_desc); - // Alpha, beta - sstream.write(&desc.alpha); - sstream.write(&desc.beta); -} - -void serialize_desc(serialization_stream_t &sstream, const gemm_desc_t &desc) { - // Kind - sstream.write(&desc.primitive_kind); - serialize_md(sstream, desc.a_desc); - serialize_md(sstream, desc.b_desc); - serialize_md(sstream, desc.c_desc); - serialize_md(sstream, desc.bias_desc); - // Accumulator type - sstream.write(&desc.acc_type); - sstream.write(&desc.sum_ab); - 
sstream.write(&desc.sum_ab_type); -} - -void serialize_desc(serialization_stream_t &sstream, - const group_normalization_desc_t &desc) { - // Kinds - sstream.write(&desc.primitive_kind); - sstream.write(&desc.prop_kind); - // Memory descriptors - serialize_md(sstream, desc.src_desc); - serialize_md(sstream, desc.dst_desc); - serialize_md(sstream, desc.diff_src_desc); - serialize_md(sstream, desc.diff_dst_desc); - serialize_md(sstream, desc.scaleshift_desc); - serialize_md(sstream, desc.diff_scaleshift_desc); - serialize_md(sstream, desc.stat_desc); - // Groups - sstream.write(&desc.groups); - // Epsilon - sstream.write(&desc.group_norm_epsilon); - // Flags - sstream.write(&desc.flags); -} - -void serialize_desc( - serialization_stream_t &sstream, const inner_product_desc_t &desc) { - // Kinds - sstream.write(&desc.primitive_kind); - sstream.write(&desc.prop_kind); - // Memory descriptors - serialize_md(sstream, desc.src_desc); - serialize_md(sstream, desc.diff_src_desc); - serialize_md(sstream, desc.weights_desc); - serialize_md(sstream, desc.diff_weights_desc); - serialize_md(sstream, desc.bias_desc); - serialize_md(sstream, desc.diff_bias_desc); - serialize_md(sstream, desc.dst_desc); - serialize_md(sstream, desc.diff_dst_desc); - // Accumulator type - sstream.write(&desc.accum_data_type); -} - -void serialize_desc(serialization_stream_t &sstream, - const layer_normalization_desc_t &desc) { - // Kinds - sstream.write(&desc.primitive_kind); - sstream.write(&desc.prop_kind); - // Memory descriptors - serialize_md(sstream, desc.src_desc); - serialize_md(sstream, desc.diff_src_desc); - serialize_md(sstream, desc.data_scaleshift_desc); - serialize_md(sstream, desc.diff_data_scaleshift_desc); - serialize_md(sstream, desc.dst_desc); - serialize_md(sstream, desc.diff_dst_desc); - serialize_md(sstream, desc.stat_desc); - // Epsilon - sstream.write(&desc.layer_norm_epsilon); - // Flags - sstream.write(&desc.flags); -} - -void serialize_desc(serialization_stream_t &sstream, const lrn_desc_t &desc) { - // Kinds - sstream.write(&desc.primitive_kind); - sstream.write(&desc.prop_kind); - sstream.write(&desc.alg_kind); - // Memory descriptors - serialize_md(sstream, desc.src_desc); - serialize_md(sstream, desc.dst_desc); - serialize_md(sstream, desc.diff_src_desc); - serialize_md(sstream, desc.diff_dst_desc); - // Local size - sstream.write(&desc.local_size); - // Alpha, beta - sstream.write(&desc.lrn_alpha); - sstream.write(&desc.lrn_beta); - // k - sstream.write(&desc.lrn_k); -} - -void serialize_desc( - serialization_stream_t &sstream, const matmul_desc_t &desc) { - // Kinds - sstream.write(&desc.primitive_kind); - // Memory descriptors - serialize_md(sstream, desc.src_desc); - serialize_md(sstream, desc.weights_desc); - serialize_md(sstream, desc.bias_desc); - serialize_md(sstream, desc.dst_desc); - // Accumulator type - sstream.write(&desc.accum_data_type); -} - -void serialize_desc( - serialization_stream_t &sstream, const pooling_desc_t &desc) { - // Kinds - sstream.write(&desc.primitive_kind); - sstream.write(&desc.prop_kind); - sstream.write(&desc.alg_kind); - // Memory descriptors - serialize_md(sstream, desc.src_desc); - serialize_md(sstream, desc.diff_src_desc); - serialize_md(sstream, desc.dst_desc); - serialize_md(sstream, desc.diff_dst_desc); - // Strides, dilates, padding - sstream.write(desc.strides, DNNL_MAX_NDIMS); - sstream.write(desc.kernel, DNNL_MAX_NDIMS); - sstream.write(desc.padding[0], DNNL_MAX_NDIMS); - sstream.write(desc.padding[1], DNNL_MAX_NDIMS); - 
sstream.write(desc.dilation, DNNL_MAX_NDIMS); - // Accumulator type - sstream.write(&desc.accum_data_type); -} - -void serialize_desc(serialization_stream_t &sstream, const prelu_desc_t &desc) { - // Kinds - sstream.write(&desc.primitive_kind); - sstream.write(&desc.prop_kind); - // Memory descriptors - serialize_md(sstream, desc.src_desc); - serialize_md(sstream, desc.weights_desc); - serialize_md(sstream, desc.dst_desc); - serialize_md(sstream, desc.diff_src_desc); - serialize_md(sstream, desc.diff_weights_desc); - serialize_md(sstream, desc.diff_dst_desc); -} - -void serialize_desc( - serialization_stream_t &sstream, const reduction_desc_t &desc) { - // Kinds - sstream.write(&desc.primitive_kind); - sstream.write(&desc.alg_kind); - // Memory descriptors - serialize_md(sstream, desc.src_desc); - serialize_md(sstream, desc.dst_desc); - // P, eps - sstream.write(&desc.p); - sstream.write(&desc.eps); -} - -void serialize_desc( - serialization_stream_t &sstream, const reorder_desc_t &desc) { - // Kinds - sstream.write(&desc.primitive_kind); - // Memory descriptors - serialize_md(sstream, *desc.src_md); - serialize_md(sstream, *desc.dst_md); - // Kinds of source and destination engines - sstream.write(&desc.src_engine_kind); - sstream.write(&desc.dst_engine_kind); - sstream.write(&desc.is_cross_engine); -} - -void serialize_desc( - serialization_stream_t &sstream, const resampling_desc_t &desc) { - // Kinds - sstream.write(&desc.primitive_kind); - sstream.write(&desc.alg_kind); - // Memory descriptors - serialize_md(sstream, desc.src_desc); - serialize_md(sstream, desc.diff_src_desc); - serialize_md(sstream, desc.dst_desc); - serialize_md(sstream, desc.diff_dst_desc); - // Factors - sstream.write(desc.factors, DNNL_MAX_NDIMS); -} - -void serialize_desc(serialization_stream_t &sstream, const rnn_desc_t &desc) { - // Kinds - sstream.write(&desc.primitive_kind); - sstream.write(&desc.prop_kind); - sstream.write(&desc.cell_kind); - sstream.write(&desc.direction); - // Memory descriptors - serialize_md(sstream, desc.src_layer_desc); - serialize_md(sstream, desc.src_iter_desc); - serialize_md(sstream, desc.src_iter_c_desc); - serialize_md(sstream, desc.weights_layer_desc); - serialize_md(sstream, desc.weights_iter_desc); - serialize_md(sstream, desc.bias_desc); - serialize_md(sstream, desc.dst_layer_desc); - serialize_md(sstream, desc.dst_iter_desc); - serialize_md(sstream, desc.dst_iter_c_desc); - serialize_md(sstream, desc.weights_peephole_desc); - serialize_md(sstream, desc.weights_projection_desc); - serialize_md(sstream, desc.diff_src_layer_desc); - serialize_md(sstream, desc.diff_src_iter_desc); - serialize_md(sstream, desc.diff_src_iter_c_desc); - serialize_md(sstream, desc.diff_weights_layer_desc); - serialize_md(sstream, desc.diff_weights_iter_desc); - serialize_md(sstream, desc.diff_bias_desc); - serialize_md(sstream, desc.diff_dst_layer_desc); - serialize_md(sstream, desc.diff_dst_iter_desc); - serialize_md(sstream, desc.diff_dst_iter_c_desc); - serialize_md(sstream, desc.diff_weights_peephole_desc); - serialize_md(sstream, desc.diff_weights_projection_desc); - // Flags - sstream.write(&desc.flags); - // Activation kind - sstream.write(&desc.activation_kind); - // Alpha, beta - sstream.write(&desc.alpha); - sstream.write(&desc.beta); -} - -// Shuffle -void serialize_desc( - serialization_stream_t &sstream, const shuffle_desc_t &desc) { - // Kinds - sstream.write(&desc.primitive_kind); - sstream.write(&desc.prop_kind); - // Memory descriptors - serialize_md(sstream, desc.src_desc); - 
serialize_md(sstream, desc.dst_desc); - // Axis - sstream.write(&desc.axis); - // Groupe size - sstream.write(&desc.group_size); -} - -void serialize_desc( - serialization_stream_t &sstream, const softmax_desc_t &desc) { - // Kinds - sstream.write(&desc.primitive_kind); - sstream.write(&desc.prop_kind); - sstream.write(&desc.alg_kind); - // Memory descriptors - serialize_md(sstream, desc.src_desc); - serialize_md(sstream, desc.diff_src_desc); - serialize_md(sstream, desc.dst_desc); - serialize_md(sstream, desc.diff_dst_desc); - // Axis - sstream.write(&desc.softmax_axis); -} - -void serialize_desc(serialization_stream_t &sstream, const sum_desc_t &desc) { - // Kinds - sstream.write(&desc.primitive_kind); - // Memory descriptors - serialize_md(sstream, *desc.dst_md); - // N - sstream.write(&desc.n); - // Scales - sstream.write(desc.scales, desc.n); - // Array of mds - for (int i = 0; i < desc.n; i++) - serialize_md(sstream, *desc.src_mds[i]); -} - -void serialize_desc(serialization_stream_t &sstream, const sdpa_desc_t &desc) { - // Kind - sstream.write(&desc.primitive_kind); - serialize_md(sstream, desc.q_desc); - serialize_md(sstream, desc.k_desc); - serialize_md(sstream, desc.v_desc); - serialize_md(sstream, desc.dst_desc); - serialize_md(sstream, desc.attn_mask_desc); - sstream.write(&desc.scale_dt); - sstream.write(&desc.invert_scale); -} - -} // namespace serialization -} // namespace impl -} // namespace dnnl diff --git a/src/common/serialization.hpp b/src/common/serialization.hpp index afd4ffba136..f575b140979 100644 --- a/src/common/serialization.hpp +++ b/src/common/serialization.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2021-2024 Intel Corporation +* Copyright 2021-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
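The hunk below replaces the old write-only serialization namespace with a self-contained byte stream: serialization_stream_t::append() packs trivially-serialized values (or defers to a type's own serialize() member), get_hash() hashes the accumulated bytes, and deserializer_t::pop() reads values back in order. A minimal usage sketch, illustrative only and built solely from members visible in this patch (the struct and function names here are hypothetical):

    #include <cassert>
    #include <cstdint>

    #include "common/serialization.hpp"

    using namespace dnnl::impl;

    // A POD config can opt into serialize()/deserialize() through the CRTP
    // helper trivially_serializable_t (also introduced below). Integer-only
    // members keep has_unique_object_representations satisfied in C++17.
    struct conf_t : trivially_serializable_t<conf_t> {
        int32_t simd = 16;
        int32_t unroll = 4;
    };

    void serialization_example() {
        // Pack two trivially-serialized values; append() is variadic.
        serialization_stream_t s;
        s.append(int32_t(42), 2.5f);

        // Order-sensitive hash of the raw bytes, usable as a cache key.
        size_t key = s.get_hash();
        (void)key;

        // Read the values back in append order.
        deserializer_t d(s);
        int32_t i = 0;
        float f = 0.f;
        d.pop(i);
        d.pop(f);
        assert(i == 42 && f == 2.5f);

        // Round-trip a whole struct through the CRTP helper.
        conf_t c0;
        conf_t c1 = conf_t::deserialize(c0.serialize());
        assert(c1.simd == c0.simd && c1.unroll == c0.unroll);
    }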
@@ -17,59 +17,272 @@
 #ifndef COMMON_SERIALIZATION_HPP
 #define COMMON_SERIALIZATION_HPP
 
-#include "common/c_types_map.hpp"
-#include "common/primitive_attr.hpp"
-#include "common/serialization_stream.hpp"
-#include "common/type_helpers.hpp"
-#include "oneapi/dnnl/dnnl.h"
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <cstring>
+#include <iomanip>
+#include <sstream>
+#include <vector>
+
+#include "common/utils.hpp"
 
 namespace dnnl {
 namespace impl {
-namespace serialization {
-
-void serialize_post_ops(
-        serialization_stream_t &sstream, const post_ops_t &post_ops);
-void serialize_attr(
-        serialization_stream_t &sstream, const primitive_attr_t &attr);
-void serialize_md(serialization_stream_t &sstream, const memory_desc_t &md);
-void serialize_desc(serialization_stream_t &sstream, const concat_desc_t &desc);
-void serialize_desc(serialization_stream_t &sstream,
-        const batch_normalization_desc_t &desc);
-void serialize_desc(serialization_stream_t &sstream, const binary_desc_t &desc);
-void serialize_desc(
-        serialization_stream_t &sstream, const convolution_desc_t &desc);
-void serialize_desc(
-        serialization_stream_t &sstream, const eltwise_desc_t &desc);
-void serialize_desc(serialization_stream_t &sstream, const gemm_desc_t &desc);
-void serialize_desc(serialization_stream_t &sstream,
-        const group_normalization_desc_t &desc);
-void serialize_desc(
-        serialization_stream_t &sstream, const inner_product_desc_t &desc);
-void serialize_desc(serialization_stream_t &sstream,
-        const layer_normalization_desc_t &desc);
-void serialize_desc(serialization_stream_t &sstream, const lrn_desc_t &desc);
-void serialize_desc(serialization_stream_t &sstream, const matmul_desc_t &desc);
-void serialize_desc(
-        serialization_stream_t &sstream, const pooling_desc_t &desc);
-void serialize_desc(serialization_stream_t &sstream, const prelu_desc_t &desc);
-void serialize_desc(
-        serialization_stream_t &sstream, const reduction_desc_t &desc);
-void serialize_desc(
-        serialization_stream_t &sstream, const reorder_desc_t &desc);
-void serialize_desc(
-        serialization_stream_t &sstream, const resampling_desc_t &desc);
-void serialize_desc(serialization_stream_t &sstream, const rnn_desc_t &desc);
-void serialize_desc(serialization_stream_t &sstream, const sdpa_desc_t &desc);
-void serialize_desc(
-        serialization_stream_t &sstream, const shuffle_desc_t &desc);
-void serialize_desc(
-        serialization_stream_t &sstream, const softmax_desc_t &desc);
-void serialize_desc(serialization_stream_t &sstream, const sum_desc_t &desc);
-
-status_t serialize_desc(
-        serialization_stream_t &sstream, const op_desc_t *op_desc);
-
-} // namespace serialization
+
+#define DNNL_ASSERT_TRIVIALLY_SERIALIZABLE(cls) \
+    static_assert(serialization_stream_t::is_trivially_serialized<cls>::value, \
+            #cls " must be trivially serializable.")
+
+struct serialization_stream_t {
+    serialization_stream_t() = default;
+
+    template <typename Arg1, typename... Args>
+    serialization_stream_t(const Arg1 &a1, const Args &...args) {
+        append(a1, args...);
+    }
+
+    static serialization_stream_t from_data(std::vector<uint8_t> data) {
+        serialization_stream_t s;
+        s.data_ = std::move(data);
+        return s;
+    }
+
+    bool operator==(const serialization_stream_t &other) const {
+        return data_ == other.data_;
+    }
+
+#if defined(__cpp_lib_has_unique_object_representations) \
+        && __cpp_lib_has_unique_object_representations >= 201606L
+    template <typename T>
+    struct is_trivially_serialized {
+        static const bool value
+                = (std::has_unique_object_representations<T>::value
+                          || std::is_floating_point<T>::value)
+                && !(std::is_pointer<T>::value);
+    };
+
+#else
+    // Fallback for backward compatibility. As the structure layout should not
+    // change between c++ versions, compiling with c++17 will already verify the
+    // structures are valid for this use case.
+    template <typename T>
+    struct is_trivially_serialized {
+        static const bool value = std::is_trivially_copyable<T>::value
+                && !(std::is_pointer<T>::value);
+    };
+#endif
+
+    template <typename T, typename = void>
+    struct has_serialize_t {
+        static const bool value = false;
+    };
+
+    template <typename T>
+    struct has_serialize_t<T,
+            decltype(std::declval<T>().serialize(
+                    std::declval<serialization_stream_t &>()))> {
+        static const bool value = true;
+    };
+
+    // Append helper function for structures with the member function
+    // void serialize(serialization_stream_t &) const
+    template <typename T,
+            utils::enable_if_t<has_serialize_t<T>::value, bool> = true>
+    void append(const T &t) {
+        t.serialize(*this);
+    }
+
+    // Append helper function for trivially serialized objects
+    template <typename T,
+            utils::enable_if_t<is_trivially_serialized<T>::value
+                            && !has_serialize_t<T>::value,
+                    bool> = true>
+    void append(const T &t) {
+        std::array<uint8_t, sizeof(T)> type_data;
+        std::memcpy(type_data.data(), &t, sizeof(T));
+        data_.insert(data_.end(), type_data.begin(), type_data.end());
+    }
+
+    template ::value, bool> = true>
+    void append(const T &v) {
+        append(v.size());
+        for (const typename T::value_type &d : v)
+            append(d);
+    }
+
+    template <typename Arg1, typename Arg2, typename... Args>
+    void append(const Arg1 &a1, const Arg2 &a2, const Args &...args) {
+        append(a1);
+        append(a2, args...);
+    }
+
+    template <typename T,
+            utils::enable_if_t<is_trivially_serialized<T>::value, bool> = true>
+    void append_array(size_t size, const T *ptr) {
+        append(size);
+        const auto *p = reinterpret_cast<const uint8_t *>(ptr);
+        data_.insert(data_.end(), p, p + sizeof(T) * size);
+    }
+
+    template <typename T,
+            utils::enable_if_t<is_trivially_serialized<T>::value, bool> = true>
+    T get(size_t idx) const {
+        T t {};
+        if (data_.size() < idx + sizeof(T)) {
+            assert(!"unexpected");
+            return t;
+        }
+        std::memcpy(&t, &data_[idx], sizeof(T));
+        return t;
+    }
+
+    void get(size_t idx, size_t size, uint8_t *ptr) const {
+        if (data_.size() < idx + size) {
+            assert(!"unexpected");
+            return;
+        }
+        std::memcpy(ptr, &data_[idx], size);
+    }
+
+    size_t get_hash() const { return hash_range(data_.data(), data_.size()); }
+
+    template <typename T>
+    static size_t get_hash(const T &t) {
+        return serialization_stream_t(t).get_hash();
+    }
+
+    std::string str() {
+        std::ostringstream oss;
+        oss << std::hex << std::setfill('0');
+        for (auto c : data_) {
+            oss << std::setw(2) << static_cast<int>(c);
+        }
+        return oss.str();
+    }
+
+    bool empty() const { return data_.empty(); }
+
+    const std::vector<uint8_t> &get_data() const { return data_; }
+
+private:
+    static size_t hash_range(const uint8_t *v, size_t size) {
+        size_t seed = 0;
+        const uint8_t *end = v + size;
+        for (; v < end; v += sizeof(seed)) {
+            size_t value = 0;
+            std::memcpy(&value, v,
+                    std::min(static_cast<size_t>(end - v), sizeof(seed)));
+            seed = hash_combine(seed, value);
+        }
+
+        return seed;
+    }
+
+    std::vector<uint8_t> data_;
+};
+
+struct deserializer_t {
+    deserializer_t(const serialization_stream_t &sstream)
+        : idx_(0), sstream_(sstream) {}
+
+    template <typename T>
+    struct has_deserialize_t {
+        using yes_t = uint8_t;
+        using no_t = uint16_t;
+
+        template <typename V>
+        static yes_t test(
+                utils::enable_if_t<std::is_same<decltype(V::deserialize(
+                                                        std::declval<deserializer_t &>())),
+                                           V>::value,
+                        bool>);
+        template <typename V>
+        static no_t test(...);
+
+        static const bool value = (sizeof(test<T>(0)) == sizeof(yes_t));
+    };
+
+    // Helper function for structures with the static member function
+    // void deserialize(deserializer_t&)
+    template <typename T,
+            utils::enable_if_t<has_deserialize_t<T>::value, bool> = true>
+    void pop(T &t) {
+        t = T::deserialize(*this);
+    }
+    template <typename T,
+            utils::enable_if_t<has_deserialize_t<T>::value, bool> = true>
+    T pop() {
+        return T::deserialize(*this);
+    }
+
+    template <typename T,
+            utils::enable_if_t<serialization_stream_t::is_trivially_serialized<
+                                       T>::value
+                            && !has_deserialize_t<T>::value,
+                    bool> = true>
+    void pop(T &t) {
+        t = sstream_.get<T>(idx_);
+        idx_ += sizeof(T);
+    }
+
+    template <typename T,
+            utils::enable_if_t<serialization_stream_t::is_trivially_serialized<
+                                       T>::value
+                            && !has_deserialize_t<T>::value,
+                    bool> = true>
+    T pop() {
+        auto idx_start = idx_;
+        idx_ += sizeof(T);
+        return sstream_.get<T>(idx_start);
+    }
+
+    // Helper for vector types
+    template ::value, bool> = true>
+    void pop(T &v) {
+        size_t size;
+        pop(size);
+        v.clear();
+        v.reserve(size);
+        for (size_t i = 0; i < size; i++) {
+            typename T::value_type t = {};
+            pop(t);
+            v.emplace_back(t);
+        }
+    }
+
+    template <typename T,
+            utils::enable_if_t<serialization_stream_t::is_trivially_serialized<
+                                       T>::value,
+                    bool> = true>
+    void pop_array(size_t &size, T *ptr) {
+        pop(size);
+        sstream_.get(idx_, sizeof(T) * size, reinterpret_cast<uint8_t *>(ptr));
+        idx_ += sizeof(T) * size;
+    }
+
+    bool empty() const { return idx_ >= sstream_.get_data().size(); }
+
+private:
+    size_t idx_ = 0;
+    const serialization_stream_t &sstream_;
+};
+
+template <typename T>
+struct trivially_serializable_t {
+    static constexpr bool is_trivially_validatable = true;
+
+    serialization_stream_t serialize() const {
+        DNNL_ASSERT_TRIVIALLY_SERIALIZABLE(T);
+        return serialization_stream_t(*static_cast<const T *>(this));
+    }
+
+    static T deserialize(const serialization_stream_t &s) {
+        return deserializer_t(s).pop<T>();
+    }
+};
+
 } // namespace impl
 } // namespace dnnl
diff --git a/src/common/serialization_stream.hpp b/src/common/serialization_stream.hpp
deleted file mode 100644
index 28eb32aad61..00000000000
--- a/src/common/serialization_stream.hpp
+++ /dev/null
@@ -1,63 +0,0 @@
-/*******************************************************************************
-* Copyright 2021-2023 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*******************************************************************************/
-
-#ifndef COMMON_SERIALIZATION_STREAM_HPP
-#define COMMON_SERIALIZATION_STREAM_HPP
-
-#include <cstdint>
-#include <type_traits>
-#include <vector>
-
-namespace dnnl {
-namespace impl {
-
-struct serialization_stream_t {
-    serialization_stream_t() = default;
-
-    template <typename T>
-    void write(const T ptr, size_t nelems = 1) {
-        using non_pointer_type = typename std::remove_pointer<T>::type;
-
-        static_assert(std::is_pointer<T>::value,
-                "T is expected to be a pointer type.");
-        static_assert(!std::is_pointer<non_pointer_type>::value,
-                "T cannot be a pointer to pointer.");
-        static_assert(!std::is_class<non_pointer_type>::value,
-                "non-pointer type is expected to be a trivial type to avoid "
-                "padding issues.");
-        static_assert(!std::is_array<non_pointer_type>::value,
-                "non-pointer type cannot be an array.");
-
-        write_impl((const void *)ptr, sizeof(non_pointer_type) * nelems);
-    }
-
-    bool empty() const { return data_.empty(); }
-
-    const std::vector<uint8_t> &get_data() const { return data_; }
-
-private:
-    void write_impl(const void *ptr, size_t size) {
-        const auto *p = reinterpret_cast<const uint8_t *>(ptr);
-        data_.insert(data_.end(), p, p + size);
-    }
-
-    std::vector<uint8_t> data_;
-};
-
-} // namespace impl
-} // namespace dnnl
-
-#endif
diff --git a/src/common/shuffle_pd.hpp b/src/common/shuffle_pd.hpp
index dec26b107f2..5a2886ee210 100644
--- a/src/common/shuffle_pd.hpp
+++ b/src/common/shuffle_pd.hpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2018-2024 Intel Corporation
+* Copyright 2018-2025 Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -34,11 +34,12 @@
 namespace dnnl {
 namespace impl {
 
+// NOLINTBEGIN(google-default-arguments)
 struct shuffle_pd_t : public primitive_desc_t {
     static constexpr auto base_pkind = primitive_kind::shuffle;
 
-    typedef shuffle_pd_t base_class;
-    typedef shuffle_pd_t hint_class;
+    using base_class = shuffle_pd_t;
+    using hint_class = shuffle_pd_t;
 
     const shuffle_desc_t *desc() const { return &desc_; }
     const op_desc_t *op_desc() const override {
@@ -145,10 +146,10 @@
     memory_desc_t src_md_;
     memory_desc_t dst_md_;
 
-    shuffle_pd_t(const shuffle_desc_t *adesc, const primitive_attr_t *attr,
+    shuffle_pd_t(const op_desc_t *adesc, const primitive_attr_t *attr,
             const shuffle_pd_t *hint_fwd_pd)
         : primitive_desc_t(attr, base_pkind)
-        , desc_(*adesc)
+        , desc_(*op_desc_t::to_desc(adesc))
         , hint_fwd_pd_(hint_fwd_pd)
         , src_md_(desc_.src_desc)
         , dst_md_(desc_.dst_desc) {
@@ -179,6 +180,7 @@
         return is_fwd() ? src_md() : diff_src_md();
     }
 };
+// NOLINTEND(google-default-arguments)
 
 } // namespace impl
 } // namespace dnnl
diff --git a/src/common/softmax.cpp b/src/common/softmax.cpp
index 94e6e9c4ca5..77abe54034b 100644
--- a/src/common/softmax.cpp
+++ b/src/common/softmax.cpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2016-2023 Intel Corporation
+* Copyright 2016-2025 Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
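The softmax hunk below moves the int8 scales validation to the argument-based scales API while keeping the old semantics: scales are accepted only for DNNL_ARG_SRC and DNNL_ARG_DST, and only with mask == 0 (a single common scale per tensor). On the user side that corresponds to a sketch like the following (illustrative only; the function name is hypothetical, and it relies on the public set_scales_mask() attribute API):

    #include "oneapi/dnnl/dnnl.hpp"

    // Per-tensor scales (mask = 0) on src and dst -- the only scales
    // configuration the check below accepts for int8 softmax.
    dnnl::primitive_attr make_int8_softmax_attr() {
        dnnl::primitive_attr attr;
        attr.set_scales_mask(DNNL_ARG_SRC, 0);
        attr.set_scales_mask(DNNL_ARG_DST, 0);
        return attr;
    }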
@@ -107,18 +107,26 @@ status_t softmax_attr_check(const softmax_desc_t &desc, const engine_t *engine,
 
     const bool is_int8 = utils::one_of(src_dt, data_type::s8, data_type::u8)
             || utils::one_of(dst_dt, data_type::s8, data_type::u8);
-    if (is_int8) fwd_attr_mask |= smask_t::scales_runtime;
+    if (is_int8) fwd_attr_mask |= smask_t::scales;
 
     VCHECK_SOFTMAX_UNIMPL(attr->has_default_values(fwd_attr_mask, dst_dt),
             VERBOSE_UNSUPPORTED_ATTR);
 
+    // Check scales
     if (!attr->scales_.has_default_values()) {
-        const auto &sc = attr->scales_;
-        const int mask_src = sc.get(DNNL_ARG_SRC).mask_;
-        const int mask_dst = sc.get(DNNL_ARG_DST).mask_;
-
-        VCHECK_SOFTMAX_UNIMPL(utils::everyone_is(0, mask_src, mask_dst),
+        static const std::vector<int> supported_args {
+                DNNL_ARG_SRC, DNNL_ARG_DST};
+        VCHECK_SOFTMAX_UNIMPL(
+                attr->scales_.has_default_values(supported_args),
                 VERBOSE_UNSUPPORTED_SCALES_CFG);
+
+        for (int arg : supported_args) {
+            if (attr->scales_.has_default_values(arg)) continue;
+
+            const int mask = attr->scales_.get_mask(arg);
+            VCHECK_SOFTMAX_UNIMPL(
+                    mask == 0, VERBOSE_UNSUPPORTED_SCALES_CFG);
+        }
     }
 
     // Check post-ops
@@ -127,6 +135,9 @@ status_t softmax_attr_check(const softmax_desc_t &desc, const engine_t *engine,
         using namespace primitive_kind;
         VCHECK_SOFTMAX_UNIMPL(po.has_default_values({binary, eltwise}),
                 VERBOSE_UNSUPPORTED_POSTOP);
+
+        // Note: verbose support is inside the call.
+        CHECK(po.validate_binary_with_dst_consistency(&desc.dst_desc));
     }
     } else {
         VCHECK_SOFTMAX_UNIMPL(false, VERBOSE_UNSUPPORTED_ATTR);
diff --git a/src/common/softmax_pd.hpp b/src/common/softmax_pd.hpp
index e42ae59e1e2..623772a2c61 100644
--- a/src/common/softmax_pd.hpp
+++ b/src/common/softmax_pd.hpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2016-2024 Intel Corporation
+* Copyright 2016-2025 Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -108,10 +108,10 @@ struct softmax_pd_t : public primitive_desc_t {
 
     memory_desc_t dst_md_;
 
-    softmax_pd_t(const softmax_desc_t *adesc, const primitive_attr_t *attr,
+    softmax_pd_t(const op_desc_t *adesc, const primitive_attr_t *attr,
            const softmax_fwd_pd_t *hint_fwd_pd)
         : primitive_desc_t(attr, base_pkind)
-        , desc_(*adesc)
+        , desc_(*op_desc_t::to_desc(adesc))
         , hint_fwd_pd_(hint_fwd_pd)
         , dst_md_(desc_.dst_desc) {}
 
@@ -119,17 +119,19 @@
     const memory_desc_t &dst_desc() const { return dst_md_; }
 };
 
+// NOLINTBEGIN(google-default-arguments)
 struct softmax_fwd_pd_t : public softmax_pd_t {
-    typedef softmax_fwd_pd_t base_class;
-    typedef softmax_fwd_pd_t hint_class;
+    using base_class = softmax_fwd_pd_t;
+    using hint_class = softmax_fwd_pd_t;
 
     arg_usage_t arg_usage(int arg) const override {
         if (arg == DNNL_ARG_SRC) return arg_usage_t::input;
 
         if (arg == DNNL_ARG_DST) return arg_usage_t::output;
 
-        if (arg == DNNL_ARG_WORKSPACE && (!types::is_zero_md(workspace_md())))
-            return arg_usage_t::output;
+        if (arg == DNNL_ARG_WORKSPACE)
+            return !types::is_zero_md(workspace_md()) ? arg_usage_t::output
+                                                      : arg_usage_t::unused;
 
         return primitive_desc_t::arg_usage(arg);
     }
@@ -162,7 +164,7 @@ struct softmax_fwd_pd_t : public softmax_pd_t {
 protected:
     memory_desc_t src_md_;
 
-    softmax_fwd_pd_t(const softmax_desc_t *adesc, const primitive_attr_t *attr,
+    softmax_fwd_pd_t(const op_desc_t *adesc, const primitive_attr_t *attr,
             const softmax_fwd_pd_t *hint_fwd_pd)
         : softmax_pd_t(adesc, attr, hint_fwd_pd), src_md_(desc_.src_desc) {}
 
@@ -176,19 +178,28 @@ struct softmax_fwd_pd_t : public softmax_pd_t {
             dst_md_, src_md_.format_desc.blocking);
     }
 
-    bool attr_scales_ok() const {
+    bool attr_scales_ok(const std::vector<int> &supported_args
+            = {DNNL_ARG_SRC, DNNL_ARG_DST}) const {
         const auto &scales = attr()->scales_;
-        bool ok = true;
-        for (const auto &e : scales.scales_) {
-            ok = ok && e.second.mask_ == 0;
+        bool ok = scales.has_default_values(supported_args);
+
+        for (const auto &arg : supported_args) {
+            if (scales.has_default_values(arg)) continue;
+
+            // TODO: disallow non-int8 scales?
+            // const data_type_t dt = arg_md(arg)->data_type;
+            // ok = ok && utils::one_of(dt, s8, u8);
+            ok = ok && scales.get_mask(arg) == 0;
         }
         return ok;
     }
 };
+// NOLINTEND(google-default-arguments)
 
+// NOLINTBEGIN(google-default-arguments)
 struct softmax_bwd_pd_t : public softmax_pd_t {
-    typedef softmax_bwd_pd_t base_class;
-    typedef softmax_fwd_pd_t hint_class;
+    using base_class = softmax_bwd_pd_t;
+    using hint_class = softmax_fwd_pd_t;
 
     arg_usage_t arg_usage(int arg) const override {
         if (utils::one_of(arg, DNNL_ARG_DST, DNNL_ARG_DIFF_DST))
@@ -196,8 +207,9 @@
             return arg_usage_t::input;
 
         if (arg == DNNL_ARG_DIFF_SRC) return arg_usage_t::output;
 
-        if (arg == DNNL_ARG_WORKSPACE && (!types::is_zero_md(workspace_md())))
-            return arg_usage_t::input;
+        if (arg == DNNL_ARG_WORKSPACE)
+            return !types::is_zero_md(workspace_md()) ? arg_usage_t::input
+                                                      : arg_usage_t::unused;
 
         return primitive_desc_t::arg_usage(arg);
     }
@@ -239,7 +251,7 @@
     memory_desc_t diff_src_md_;
     memory_desc_t diff_dst_md_;
 
-    softmax_bwd_pd_t(const softmax_desc_t *adesc, const primitive_attr_t *attr,
+    softmax_bwd_pd_t(const op_desc_t *adesc, const primitive_attr_t *attr,
            const softmax_fwd_pd_t *hint_fwd_pd)
         : softmax_pd_t(adesc, attr, hint_fwd_pd)
         , diff_src_md_(desc_.diff_src_desc)
@@ -260,6 +272,7 @@
         return status::success;
     }
 };
+// NOLINTEND(google-default-arguments)
 
 } // namespace impl
 } // namespace dnnl
diff --git a/src/common/spdlog/common-inl.h b/src/common/spdlog/common-inl.h
deleted file mode 100755
index 19817b2a702..00000000000
--- a/src/common/spdlog/common-inl.h
+++ /dev/null
@@ -1,74 +0,0 @@
-// Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
-// Distributed under the MIT License (http://opensource.org/licenses/MIT) - -#pragma once - -#ifndef SPDLOG_HEADER_ONLY -#include -#endif - -#include -#include - -namespace spdlog { -namespace level { - -#if __cplusplus >= 201703L -constexpr -#endif - static string_view_t level_string_views[] SPDLOG_LEVEL_NAMES; - -static const char *short_level_names[] SPDLOG_SHORT_LEVEL_NAMES; - -SPDLOG_INLINE const string_view_t &to_string_view( - spdlog::level::level_enum l) SPDLOG_NOEXCEPT { - return level_string_views[l]; -} - -SPDLOG_INLINE const char *to_short_c_str( - spdlog::level::level_enum l) SPDLOG_NOEXCEPT { - return short_level_names[l]; -} - -SPDLOG_INLINE spdlog::level::level_enum from_str( - const std::string &name) SPDLOG_NOEXCEPT { - auto it = std::find( - std::begin(level_string_views), std::end(level_string_views), name); - if (it != std::end(level_string_views)) - return static_cast( - std::distance(std::begin(level_string_views), it)); - - // check also for "warn" and "err" before giving up.. - if (name == "warn") { return level::warn; } - if (name == "err") { return level::err; } - return level::off; -} -} // namespace level - -SPDLOG_INLINE spdlog_ex::spdlog_ex(std::string msg) : msg_(std::move(msg)) {} - -SPDLOG_INLINE spdlog_ex::spdlog_ex(const std::string &msg, int last_errno) { -#ifdef SPDLOG_USE_STD_FORMAT - msg_ = std::system_error( - std::error_code(last_errno, std::generic_category()), msg) - .what(); -#else - memory_buf_t outbuf; - fmt::format_system_error(outbuf, last_errno, msg.c_str()); - msg_ = fmt::to_string(outbuf); -#endif -} - -SPDLOG_INLINE const char *spdlog_ex::what() const SPDLOG_NOEXCEPT { - return msg_.c_str(); -} - -SPDLOG_INLINE void throw_spdlog_ex(const std::string &msg, int last_errno) { - SPDLOG_THROW(spdlog_ex(msg, last_errno)); -} - -SPDLOG_INLINE void throw_spdlog_ex(std::string msg) { - SPDLOG_THROW(spdlog_ex(std::move(msg))); -} - -} // namespace spdlog diff --git a/src/common/spdlog/common.h b/src/common/spdlog/common.h deleted file mode 100755 index 69f4289b540..00000000000 --- a/src/common/spdlog/common.h +++ /dev/null @@ -1,424 +0,0 @@ -// Copyright(c) 2015-present, Gabi Melman & spdlog contributors. 
-// Distributed under the MIT License (http://opensource.org/licenses/MIT) - -#pragma once - -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef SPDLOG_USE_STD_FORMAT -#include -#if __cpp_lib_format >= 202207L -#include -#else -#include -#endif -#endif - -#ifdef SPDLOG_COMPILED_LIB -#undef SPDLOG_HEADER_ONLY -#if defined(SPDLOG_SHARED_LIB) -#if defined(_WIN32) -#ifdef spdlog_EXPORTS -#define SPDLOG_API __declspec(dllexport) -#else // !spdlog_EXPORTS -#define SPDLOG_API __declspec(dllimport) -#endif -#else // !defined(_WIN32) -#define SPDLOG_API __attribute__((visibility("default"))) -#endif -#else // !defined(SPDLOG_SHARED_LIB) -#define SPDLOG_API -#endif -#define SPDLOG_INLINE -#else // !defined(SPDLOG_COMPILED_LIB) -#define SPDLOG_API -#define SPDLOG_HEADER_ONLY -#define SPDLOG_INLINE inline -#endif // #ifdef SPDLOG_COMPILED_LIB - -#include - -#if !defined(SPDLOG_USE_STD_FORMAT) \ - && FMT_VERSION \ - >= 80000 // backward compatibility with fmt versions older than 8 -#define SPDLOG_FMT_RUNTIME(format_string) fmt::runtime(format_string) -#define SPDLOG_FMT_STRING(format_string) FMT_STRING(format_string) -#if defined(SPDLOG_WCHAR_FILENAMES) || defined(SPDLOG_WCHAR_TO_UTF8_SUPPORT) -#include -#endif -#else -#define SPDLOG_FMT_RUNTIME(format_string) format_string -#define SPDLOG_FMT_STRING(format_string) format_string -#endif - -// visual studio up to 2013 does not support noexcept nor constexpr -#if defined(_MSC_VER) && (_MSC_VER < 1900) -#define SPDLOG_NOEXCEPT _NOEXCEPT -#define SPDLOG_CONSTEXPR -#else -#define SPDLOG_NOEXCEPT noexcept -#define SPDLOG_CONSTEXPR constexpr -#endif - -// If building with std::format, can just use constexpr, otherwise if building with fmt -// SPDLOG_CONSTEXPR_FUNC needs to be set the same as FMT_CONSTEXPR to avoid situations where -// a constexpr function in spdlog could end up calling a non-constexpr function in fmt -// depending on the compiler -// If fmt determines it can't use constexpr, we should inline the function instead -#ifdef SPDLOG_USE_STD_FORMAT -#define SPDLOG_CONSTEXPR_FUNC constexpr -#else // Being built with fmt -#if FMT_USE_CONSTEXPR -#define SPDLOG_CONSTEXPR_FUNC FMT_CONSTEXPR -#else -#define SPDLOG_CONSTEXPR_FUNC inline -#endif -#endif - -#if defined(__GNUC__) || defined(__clang__) -#define SPDLOG_DEPRECATED __attribute__((deprecated)) -#elif defined(_MSC_VER) -#define SPDLOG_DEPRECATED __declspec(deprecated) -#else -#define SPDLOG_DEPRECATED -#endif - -// disable thread local on msvc 2013 -#ifndef SPDLOG_NO_TLS -#if (defined(_MSC_VER) && (_MSC_VER < 1900)) || defined(__cplusplus_winrt) -#define SPDLOG_NO_TLS 1 -#endif -#endif - -#ifndef SPDLOG_FUNCTION -#define SPDLOG_FUNCTION static_cast(__FUNCTION__) -#endif - -#ifdef SPDLOG_NO_EXCEPTIONS -#define SPDLOG_TRY -#define SPDLOG_THROW(ex) \ - do { \ - printf("spdlog fatal error: %s\n", ex.what()); \ - std::abort(); \ - } while (0) -#define SPDLOG_CATCH_STD -#else -#define SPDLOG_TRY try -#define SPDLOG_THROW(ex) throw(ex) -#define SPDLOG_CATCH_STD \ - catch (const std::exception &) { \ - } -#endif - -namespace spdlog { - -class formatter; - -namespace sinks { -class sink; -} - -#if defined(_WIN32) && defined(SPDLOG_WCHAR_FILENAMES) -using filename_t = std::wstring; -// allow macro expansion to occur in SPDLOG_FILENAME_T -#define SPDLOG_FILENAME_T_INNER(s) L##s -#define SPDLOG_FILENAME_T(s) SPDLOG_FILENAME_T_INNER(s) -#else -using filename_t = std::string; -#define SPDLOG_FILENAME_T(s) s -#endif - -using log_clock = 
std::chrono::system_clock; -using sink_ptr = std::shared_ptr; -using sinks_init_list = std::initializer_list; -using err_handler = std::function; -#ifdef SPDLOG_USE_STD_FORMAT -namespace fmt_lib = std; - -using string_view_t = std::string_view; -using memory_buf_t = std::string; - -template -#if __cpp_lib_format >= 202207L -using format_string_t = std::format_string; -#else -using format_string_t = std::string_view; -#endif - -template -struct is_convertible_to_basic_format_string - : std::integral_constant>::value> {}; - -#if defined(SPDLOG_WCHAR_FILENAMES) || defined(SPDLOG_WCHAR_TO_UTF8_SUPPORT) -using wstring_view_t = std::wstring_view; -using wmemory_buf_t = std::wstring; - -template -#if __cpp_lib_format >= 202207L -using wformat_string_t = std::wformat_string; -#else -using wformat_string_t = std::wstring_view; -#endif -#endif -#define SPDLOG_BUF_TO_STRING(x) x -#else // use fmt lib instead of std::format -namespace fmt_lib = fmt; - -using string_view_t = fmt::basic_string_view; -using memory_buf_t = fmt::basic_memory_buffer; - -template -using format_string_t = fmt::format_string; - -template -using remove_cvref_t = - typename std::remove_cv::type>::type; - -template -#if FMT_VERSION >= 90101 -using fmt_runtime_string = fmt::runtime_format_string; -#else -using fmt_runtime_string = fmt::basic_runtime; -#endif - -// clang doesn't like SFINAE disabled constructor in std::is_convertible<> so have to repeat the -// condition from basic_format_string here, in addition, fmt::basic_runtime is only -// convertible to basic_format_string but not basic_string_view -template -struct is_convertible_to_basic_format_string - : std::integral_constant>::value - || std::is_same, - fmt_runtime_string>::value> {}; - -#if defined(SPDLOG_WCHAR_FILENAMES) || defined(SPDLOG_WCHAR_TO_UTF8_SUPPORT) -using wstring_view_t = fmt::basic_string_view; -using wmemory_buf_t = fmt::basic_memory_buffer; - -template -using wformat_string_t = fmt::wformat_string; -#endif -#define SPDLOG_BUF_TO_STRING(x) fmt::to_string(x) -#endif - -#ifdef SPDLOG_WCHAR_TO_UTF8_SUPPORT -#ifndef _WIN32 -#error SPDLOG_WCHAR_TO_UTF8_SUPPORT only supported on windows -#endif // _WIN32 -#endif // SPDLOG_WCHAR_TO_UTF8_SUPPORT - -template -struct is_convertible_to_any_format_string - : std::integral_constant::value - || is_convertible_to_basic_format_string::value> {}; - -#if defined(SPDLOG_NO_ATOMIC_LEVELS) -using level_t = details::null_atomic_int; -#else -using level_t = std::atomic; -#endif - -#define SPDLOG_LEVEL_TRACE 0 -#define SPDLOG_LEVEL_DEBUG 1 -#define SPDLOG_LEVEL_INFO 2 -#define SPDLOG_LEVEL_WARN 3 -#define SPDLOG_LEVEL_ERROR 4 -#define SPDLOG_LEVEL_CRITICAL 5 -#define SPDLOG_LEVEL_OFF 6 - -#if !defined(SPDLOG_ACTIVE_LEVEL) -#define SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_INFO -#endif - -// Log level enum -namespace level { -enum level_enum : int { - trace = SPDLOG_LEVEL_TRACE, - debug = SPDLOG_LEVEL_DEBUG, - info = SPDLOG_LEVEL_INFO, - warn = SPDLOG_LEVEL_WARN, - err = SPDLOG_LEVEL_ERROR, - critical = SPDLOG_LEVEL_CRITICAL, - off = SPDLOG_LEVEL_OFF, - n_levels -}; - -#define SPDLOG_LEVEL_NAME_TRACE spdlog::string_view_t("trace", 5) -#define SPDLOG_LEVEL_NAME_DEBUG spdlog::string_view_t("debug", 5) -#define SPDLOG_LEVEL_NAME_INFO spdlog::string_view_t("info", 4) -#define SPDLOG_LEVEL_NAME_WARNING spdlog::string_view_t("warning", 7) -#define SPDLOG_LEVEL_NAME_ERROR spdlog::string_view_t("error", 5) -#define SPDLOG_LEVEL_NAME_CRITICAL spdlog::string_view_t("critical", 8) -#define SPDLOG_LEVEL_NAME_OFF spdlog::string_view_t("off", 3) - 
-#if !defined(SPDLOG_LEVEL_NAMES) -#define SPDLOG_LEVEL_NAMES \ - { \ - SPDLOG_LEVEL_NAME_TRACE, SPDLOG_LEVEL_NAME_DEBUG, \ - SPDLOG_LEVEL_NAME_INFO, SPDLOG_LEVEL_NAME_WARNING, \ - SPDLOG_LEVEL_NAME_ERROR, SPDLOG_LEVEL_NAME_CRITICAL, \ - SPDLOG_LEVEL_NAME_OFF \ - } -#endif - -#if !defined(SPDLOG_SHORT_LEVEL_NAMES) - -#define SPDLOG_SHORT_LEVEL_NAMES \ - { "T", "D", "I", "W", "E", "C", "O" } -#endif - -SPDLOG_API const string_view_t &to_string_view( - spdlog::level::level_enum l) SPDLOG_NOEXCEPT; -SPDLOG_API const char *to_short_c_str( - spdlog::level::level_enum l) SPDLOG_NOEXCEPT; -SPDLOG_API spdlog::level::level_enum from_str( - const std::string &name) SPDLOG_NOEXCEPT; - -} // namespace level - -// -// Color mode used by sinks with color support. -// -enum class color_mode { always, automatic, never }; - -// -// Pattern time - specific time getting to use for pattern_formatter. -// local time by default -// -enum class pattern_time_type { - local, // log localtime - utc // log utc -}; - -// -// Log exception -// -class SPDLOG_API spdlog_ex : public std::exception { -public: - explicit spdlog_ex(std::string msg); - spdlog_ex(const std::string &msg, int last_errno); - const char *what() const SPDLOG_NOEXCEPT override; - -private: - std::string msg_; -}; - -[[noreturn]] SPDLOG_API void throw_spdlog_ex( - const std::string &msg, int last_errno); -[[noreturn]] SPDLOG_API void throw_spdlog_ex(std::string msg); - -struct source_loc { - SPDLOG_CONSTEXPR source_loc() = default; - SPDLOG_CONSTEXPR source_loc( - const char *filename_in, int line_in, const char *funcname_in) - : filename {filename_in}, line {line_in}, funcname {funcname_in} {} - - SPDLOG_CONSTEXPR bool empty() const SPDLOG_NOEXCEPT { return line <= 0; } - const char *filename {nullptr}; - int line {0}; - const char *funcname {nullptr}; -}; - -struct file_event_handlers { - file_event_handlers() - : before_open(nullptr) - , after_open(nullptr) - , before_close(nullptr) - , after_close(nullptr) {} - - std::function before_open; - std::function - after_open; - std::function - before_close; - std::function after_close; -}; - -namespace details { - -// to_string_view - -SPDLOG_CONSTEXPR_FUNC spdlog::string_view_t to_string_view( - const memory_buf_t &buf) SPDLOG_NOEXCEPT { - return spdlog::string_view_t {buf.data(), buf.size()}; -} - -SPDLOG_CONSTEXPR_FUNC spdlog::string_view_t to_string_view( - spdlog::string_view_t str) SPDLOG_NOEXCEPT { - return str; -} - -#if defined(SPDLOG_WCHAR_FILENAMES) || defined(SPDLOG_WCHAR_TO_UTF8_SUPPORT) -SPDLOG_CONSTEXPR_FUNC spdlog::wstring_view_t to_string_view( - const wmemory_buf_t &buf) SPDLOG_NOEXCEPT { - return spdlog::wstring_view_t {buf.data(), buf.size()}; -} - -SPDLOG_CONSTEXPR_FUNC spdlog::wstring_view_t to_string_view( - spdlog::wstring_view_t str) SPDLOG_NOEXCEPT { - return str; -} -#endif - -#ifndef SPDLOG_USE_STD_FORMAT -template -inline fmt::basic_string_view to_string_view( - fmt::basic_format_string fmt) { - return fmt; -} -#elif __cpp_lib_format >= 202207L -template -SPDLOG_CONSTEXPR_FUNC std::basic_string_view to_string_view( - std::basic_format_string fmt) SPDLOG_NOEXCEPT { - return fmt.get(); -} -#endif - -// make_unique support for pre c++14 -#if __cplusplus >= 201402L // C++14 and beyond -using std::enable_if_t; -using std::make_unique; -#else -template -using enable_if_t = typename std::enable_if::type; - -template -std::unique_ptr make_unique(Args &&...args) { - static_assert(!std::is_array::value, "arrays not supported"); - return std::unique_ptr(new T(std::forward(args)...)); 
-} -#endif - -// to avoid useless casts (see https://github.com/nlohmann/json/issues/2893#issuecomment-889152324) -template ::value, int> = 0> -constexpr T conditional_static_cast(U value) { - return static_cast(value); -} - -template ::value, int> = 0> -constexpr T conditional_static_cast(U value) { - return value; -} - -} // namespace details -} // namespace spdlog - -#ifdef SPDLOG_HEADER_ONLY -#include "common-inl.h" -#endif diff --git a/src/common/spdlog/details/backtracer-inl.h b/src/common/spdlog/details/backtracer-inl.h deleted file mode 100755 index 14448d74c41..00000000000 --- a/src/common/spdlog/details/backtracer-inl.h +++ /dev/null @@ -1,66 +0,0 @@ -// Copyright(c) 2015-present, Gabi Melman & spdlog contributors. -// Distributed under the MIT License (http://opensource.org/licenses/MIT) - -#pragma once - -#ifndef SPDLOG_HEADER_ONLY -#include -#endif -namespace spdlog { -namespace details { -SPDLOG_INLINE backtracer::backtracer(const backtracer &other) { - std::lock_guard lock(other.mutex_); - enabled_ = other.enabled(); - messages_ = other.messages_; -} - -SPDLOG_INLINE backtracer::backtracer(backtracer &&other) SPDLOG_NOEXCEPT { - std::lock_guard lock(other.mutex_); - enabled_ = other.enabled(); - messages_ = std::move(other.messages_); -} - -SPDLOG_INLINE backtracer &backtracer::operator=(backtracer other) { - std::lock_guard lock(mutex_); - enabled_ = other.enabled(); - messages_ = std::move(other.messages_); - return *this; -} - -SPDLOG_INLINE void backtracer::enable(size_t size) { - std::lock_guard lock {mutex_}; - enabled_.store(true, std::memory_order_relaxed); - messages_ = circular_q {size}; -} - -SPDLOG_INLINE void backtracer::disable() { - std::lock_guard lock {mutex_}; - enabled_.store(false, std::memory_order_relaxed); -} - -SPDLOG_INLINE bool backtracer::enabled() const { - return enabled_.load(std::memory_order_relaxed); -} - -SPDLOG_INLINE void backtracer::push_back(const log_msg &msg) { - std::lock_guard lock {mutex_}; - messages_.push_back(log_msg_buffer {msg}); -} - -SPDLOG_INLINE bool backtracer::empty() const { - std::lock_guard lock {mutex_}; - return messages_.empty(); -} - -// pop all items in the q and apply the given fun on each of them. -SPDLOG_INLINE void backtracer::foreach_pop( - std::function fun) { - std::lock_guard lock {mutex_}; - while (!messages_.empty()) { - auto &front_msg = messages_.front(); - fun(front_msg); - messages_.pop_front(); - } -} -} // namespace details -} // namespace spdlog diff --git a/src/common/spdlog/details/file_helper-inl.h b/src/common/spdlog/details/file_helper-inl.h deleted file mode 100755 index 6d24b0ce257..00000000000 --- a/src/common/spdlog/details/file_helper-inl.h +++ /dev/null @@ -1,162 +0,0 @@ -// Copyright(c) 2015-present, Gabi Melman & spdlog contributors. 
-// Distributed under the MIT License (http://opensource.org/licenses/MIT) - -#pragma once - -#ifndef SPDLOG_HEADER_ONLY -#include -#endif - -#include -#include - -#include -#include -#include -#include -#include -#include - -namespace spdlog { -namespace details { - -SPDLOG_INLINE file_helper::file_helper( - const file_event_handlers &event_handlers) - : event_handlers_(event_handlers) {} - -SPDLOG_INLINE file_helper::~file_helper() { - close(); -} - -SPDLOG_INLINE void file_helper::open(const filename_t &fname, bool truncate) { - close(); - filename_ = fname; - - auto *mode = SPDLOG_FILENAME_T("ab"); - auto *trunc_mode = SPDLOG_FILENAME_T("wb"); - - if (event_handlers_.before_open) { event_handlers_.before_open(filename_); } - for (int tries = 0; tries < open_tries_; ++tries) { - // create containing folder if not exists already. - os::create_dir(os::dir_name(fname)); - if (truncate) { - // Truncate by opening-and-closing a tmp file in "wb" mode, always - // opening the actual log-we-write-to in "ab" mode, since that - // interacts more politely with eternal processes that might - // rotate/truncate the file underneath us. - std::FILE *tmp; - if (os::fopen_s(&tmp, fname, trunc_mode)) { continue; } - std::fclose(tmp); - } - if (!os::fopen_s(&fd_, fname, mode)) { - if (event_handlers_.after_open) { - event_handlers_.after_open(filename_, fd_); - } - return; - } - - details::os::sleep_for_millis(open_interval_); - } - - throw_spdlog_ex("Failed opening file " + os::filename_to_str(filename_) - + " for writing", - errno); -} - -SPDLOG_INLINE void file_helper::reopen(bool truncate) { - if (filename_.empty()) { - throw_spdlog_ex("Failed re opening file - was not opened before"); - } - this->open(filename_, truncate); -} - -SPDLOG_INLINE void file_helper::flush() { - if (std::fflush(fd_) != 0) { - throw_spdlog_ex( - "Failed flush to file " + os::filename_to_str(filename_), - errno); - } -} - -SPDLOG_INLINE void file_helper::sync() { - if (!os::fsync(fd_)) { - throw_spdlog_ex( - "Failed to fsync file " + os::filename_to_str(filename_), - errno); - } -} - -SPDLOG_INLINE void file_helper::close() { - if (fd_ != nullptr) { - if (event_handlers_.before_close) { - event_handlers_.before_close(filename_, fd_); - } - - std::fclose(fd_); - fd_ = nullptr; - - if (event_handlers_.after_close) { - event_handlers_.after_close(filename_); - } - } -} - -SPDLOG_INLINE void file_helper::write(const memory_buf_t &buf) { - if (fd_ == nullptr) return; - size_t msg_size = buf.size(); - auto data = buf.data(); - if (std::fwrite(data, 1, msg_size, fd_) != msg_size) { - throw_spdlog_ex( - "Failed writing to file " + os::filename_to_str(filename_), - errno); - } -} - -SPDLOG_INLINE size_t file_helper::size() const { - if (fd_ == nullptr) { - throw_spdlog_ex("Cannot use size() on closed file " - + os::filename_to_str(filename_)); - } - return os::filesize(fd_); -} - -SPDLOG_INLINE const filename_t &file_helper::filename() const { - return filename_; -} - -// -// return file path and its extension: -// -// "mylog.txt" => ("mylog", ".txt") -// "mylog" => ("mylog", "") -// "mylog." => ("mylog.", "") -// "/dir1/dir2/mylog.txt" => ("/dir1/dir2/mylog", ".txt") -// -// the starting dot in filenames is ignored (hidden files): -// -// ".mylog" => (".mylog". 
"") -// "my_folder/.mylog" => ("my_folder/.mylog", "") -// "my_folder/.mylog.txt" => ("my_folder/.mylog", ".txt") -SPDLOG_INLINE std::tuple -file_helper::split_by_extension(const filename_t &fname) { - auto ext_index = fname.rfind('.'); - - // no valid extension found - return whole path and empty string as - // extension - if (ext_index == filename_t::npos || ext_index == 0 - || ext_index == fname.size() - 1) { - return std::make_tuple(fname, filename_t()); - } - - // treat cases like "/etc/rc.d/somelogfile or "/abc/.hiddenfile" - auto folder_index = fname.find_last_of(details::os::folder_seps_filename); - if (folder_index != filename_t::npos && folder_index >= ext_index - 1) { - return std::make_tuple(fname, filename_t()); - } - - // finally - return a valid base and extension tuple - return std::make_tuple(fname.substr(0, ext_index), fname.substr(ext_index)); -} - -} // namespace details -} // namespace spdlog diff --git a/src/common/spdlog/details/log_msg-inl.h b/src/common/spdlog/details/log_msg-inl.h deleted file mode 100755 index 027ce5102c0..00000000000 --- a/src/common/spdlog/details/log_msg-inl.h +++ /dev/null @@ -1,38 +0,0 @@ -// Copyright(c) 2015-present, Gabi Melman & spdlog contributors. -// Distributed under the MIT License (http://opensource.org/licenses/MIT) - -#pragma once - -#ifndef SPDLOG_HEADER_ONLY -#include -#endif - -#include - -namespace spdlog { -namespace details { - -SPDLOG_INLINE log_msg::log_msg(spdlog::log_clock::time_point log_time, - spdlog::source_loc loc, string_view_t a_logger_name, - spdlog::level::level_enum lvl, spdlog::string_view_t msg) - : logger_name(a_logger_name) - , level(lvl) - , time(log_time) -#ifndef SPDLOG_NO_THREAD_ID - , thread_id(os::thread_id()) -#endif - , source(loc) - , payload(msg) { -} - -SPDLOG_INLINE log_msg::log_msg(spdlog::source_loc loc, - string_view_t a_logger_name, spdlog::level::level_enum lvl, - spdlog::string_view_t msg) - : log_msg(os::now(), loc, a_logger_name, lvl, msg) {} - -SPDLOG_INLINE log_msg::log_msg(string_view_t a_logger_name, - spdlog::level::level_enum lvl, spdlog::string_view_t msg) - : log_msg(os::now(), source_loc {}, a_logger_name, lvl, msg) {} - -} // namespace details -} // namespace spdlog diff --git a/src/common/spdlog/details/log_msg.h b/src/common/spdlog/details/log_msg.h deleted file mode 100755 index c11aaf257b1..00000000000 --- a/src/common/spdlog/details/log_msg.h +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright(c) 2015-present, Gabi Melman & spdlog contributors. -// Distributed under the MIT License (http://opensource.org/licenses/MIT) - -#pragma once - -#include -#include - -namespace spdlog { -namespace details { -struct SPDLOG_API log_msg { - log_msg() = default; - log_msg(log_clock::time_point log_time, source_loc loc, - string_view_t logger_name, level::level_enum lvl, - string_view_t msg); - log_msg(source_loc loc, string_view_t logger_name, level::level_enum lvl, - string_view_t msg); - log_msg(string_view_t logger_name, level::level_enum lvl, - string_view_t msg); - log_msg(const log_msg &other) = default; - log_msg &operator=(const log_msg &other) = default; - - string_view_t logger_name; - level::level_enum level {level::off}; - log_clock::time_point time; - size_t thread_id {0}; - - // wrapping the formatted text with color (updated by pattern_formatter). 
- mutable size_t color_range_start {0}; - mutable size_t color_range_end {0}; - - source_loc source; - string_view_t payload; -}; -} // namespace details -} // namespace spdlog - -#ifdef SPDLOG_HEADER_ONLY -#include "log_msg-inl.h" -#endif diff --git a/src/common/spdlog/details/log_msg_buffer-inl.h b/src/common/spdlog/details/log_msg_buffer-inl.h deleted file mode 100755 index f3ef28f3708..00000000000 --- a/src/common/spdlog/details/log_msg_buffer-inl.h +++ /dev/null @@ -1,59 +0,0 @@ -// Copyright(c) 2015-present, Gabi Melman & spdlog contributors. -// Distributed under the MIT License (http://opensource.org/licenses/MIT) - -#pragma once - -#ifndef SPDLOG_HEADER_ONLY -#include -#endif - -namespace spdlog { -namespace details { - -SPDLOG_INLINE log_msg_buffer::log_msg_buffer(const log_msg &orig_msg) - : log_msg {orig_msg} { - buffer.append(logger_name.begin(), logger_name.end()); - buffer.append(payload.begin(), payload.end()); - update_string_views(); -} - -SPDLOG_INLINE log_msg_buffer::log_msg_buffer(const log_msg_buffer &other) - : log_msg {other} { - buffer.append(logger_name.begin(), logger_name.end()); - buffer.append(payload.begin(), payload.end()); - update_string_views(); -} - -SPDLOG_INLINE log_msg_buffer::log_msg_buffer( - log_msg_buffer &&other) SPDLOG_NOEXCEPT - : log_msg {other}, - buffer {std::move(other.buffer)} { - update_string_views(); -} - -SPDLOG_INLINE log_msg_buffer &log_msg_buffer::operator=( - const log_msg_buffer &other) { - log_msg::operator=(other); - buffer.clear(); - buffer.append( - other.buffer.data(), other.buffer.data() + other.buffer.size()); - update_string_views(); - return *this; -} - -SPDLOG_INLINE log_msg_buffer &log_msg_buffer::operator=( - log_msg_buffer &&other) SPDLOG_NOEXCEPT { - log_msg::operator=(other); - buffer = std::move(other.buffer); - update_string_views(); - return *this; -} - -SPDLOG_INLINE void log_msg_buffer::update_string_views() { - logger_name = string_view_t {buffer.data(), logger_name.size()}; - payload = string_view_t { - buffer.data() + logger_name.size(), payload.size()}; -} - -} // namespace details -} // namespace spdlog diff --git a/src/common/spdlog/details/null_mutex.h b/src/common/spdlog/details/null_mutex.h deleted file mode 100755 index 1aa188fe027..00000000000 --- a/src/common/spdlog/details/null_mutex.h +++ /dev/null @@ -1,38 +0,0 @@ -// Copyright(c) 2015-present, Gabi Melman & spdlog contributors. -// Distributed under the MIT License (http://opensource.org/licenses/MIT) - -#pragma once - -#include -#include -// null, no cost dummy "mutex" and dummy "atomic" int - -namespace spdlog { -namespace details { -struct null_mutex { - void lock() const {} - void unlock() const {} -}; - -struct null_atomic_int { - int value; - null_atomic_int() = default; - - explicit null_atomic_int(int new_value) : value(new_value) {} - - int load(std::memory_order = std::memory_order_relaxed) const { - return value; - } - - void store(int new_value, std::memory_order = std::memory_order_relaxed) { - value = new_value; - } - - int exchange(int new_value, std::memory_order = std::memory_order_relaxed) { - std::swap(new_value, value); - return new_value; // return value before the call - } -}; - -} // namespace details -} // namespace spdlog diff --git a/src/common/spdlog/details/os-inl.h b/src/common/spdlog/details/os-inl.h deleted file mode 100755 index 3cf5ad7d6d7..00000000000 --- a/src/common/spdlog/details/os-inl.h +++ /dev/null @@ -1,589 +0,0 @@ -// Copyright(c) 2015-present, Gabi Melman & spdlog contributors. 
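The null_mutex deleted above is what lets spdlog compile one sink template into both a thread-safe "_mt" flavor and a zero-cost single-threaded "_st" flavor. A minimal sketch of the pattern with an illustrative counter type (not spdlog's API):

```cpp
#include <iostream>
#include <mutex>

// A no-op mutex satisfying the BasicLockable requirements.
struct null_mutex {
    void lock() const {}
    void unlock() const {}
};

// One template, two thread-safety flavors: counter<std::mutex> locks for
// real, counter<null_mutex> compiles down to plain unsynchronized code.
template <typename Mutex>
class counter {
public:
    void increment() {
        std::lock_guard<Mutex> lock(mutex_); // no-op when Mutex = null_mutex
        ++value_;
    }
    int value() const { return value_; }

private:
    Mutex mutex_;
    int value_ = 0;
};

int main() {
    counter<std::mutex> mt; // "_mt" flavor: real locking
    counter<null_mutex> st; // "_st" flavor: zero-cost locking
    mt.increment();
    st.increment();
    std::cout << mt.value() + st.value() << '\n'; // 2
}
```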
-// Distributed under the MIT License (http://opensource.org/licenses/MIT) - -#pragma once - -#ifndef SPDLOG_HEADER_ONLY -#include -#endif - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef _WIN32 -#include // for FlushFileBuffers -#include // for _get_osfhandle, _isatty, _fileno -#include // for _get_pid -#include - -#ifdef __MINGW32__ -#include -#endif - -#if defined(SPDLOG_WCHAR_TO_UTF8_SUPPORT) || defined(SPDLOG_WCHAR_FILENAMES) -#include -#include -#endif - -#include // for _mkdir/_wmkdir - -#else // unix - -#include -#include - -#ifdef __linux__ -#include //Use gettid() syscall under linux to get thread id - -#elif defined(_AIX) -#include // for pthread_getthrds_np - -#elif defined(__DragonFly__) || defined(__FreeBSD__) -#include // for pthread_getthreadid_np - -#elif defined(__NetBSD__) -#include // for _lwp_self - -#elif defined(__sun) -#include // for thr_self -#endif - -#endif // unix - -#if defined __APPLE__ -#include -#endif - -#ifndef __has_feature // Clang - feature checking macros. -#define __has_feature(x) 0 // Compatibility with non-clang compilers. -#endif - -namespace spdlog { -namespace details { -namespace os { - -SPDLOG_INLINE spdlog::log_clock::time_point now() SPDLOG_NOEXCEPT { -#if defined __linux__ && defined SPDLOG_CLOCK_COARSE - timespec ts; - ::clock_gettime(CLOCK_REALTIME_COARSE, &ts); - return std::chrono::time_point( - std::chrono::duration_cast( - std::chrono::seconds(ts.tv_sec) - + std::chrono::nanoseconds(ts.tv_nsec))); - -#else - return log_clock::now(); -#endif -} -SPDLOG_INLINE std::tm localtime(const std::time_t &time_tt) SPDLOG_NOEXCEPT { -#ifdef _WIN32 - std::tm tm; - ::localtime_s(&tm, &time_tt); -#else - std::tm tm; - ::localtime_r(&time_tt, &tm); -#endif - return tm; -} - -SPDLOG_INLINE std::tm localtime() SPDLOG_NOEXCEPT { - std::time_t now_t = ::time(nullptr); - return localtime(now_t); -} - -SPDLOG_INLINE std::tm gmtime(const std::time_t &time_tt) SPDLOG_NOEXCEPT { -#ifdef _WIN32 - std::tm tm; - ::gmtime_s(&tm, &time_tt); -#else - std::tm tm; - ::gmtime_r(&time_tt, &tm); -#endif - return tm; -} - -SPDLOG_INLINE std::tm gmtime() SPDLOG_NOEXCEPT { - std::time_t now_t = ::time(nullptr); - return gmtime(now_t); -} - -// fopen_s on non windows for writing -SPDLOG_INLINE bool fopen_s( - FILE **fp, const filename_t &filename, const filename_t &mode) { -#ifdef _WIN32 -#ifdef SPDLOG_WCHAR_FILENAMES - *fp = ::_wfsopen((filename.c_str()), mode.c_str(), _SH_DENYNO); -#else - *fp = ::_fsopen((filename.c_str()), mode.c_str(), _SH_DENYNO); -#endif -#if defined(SPDLOG_PREVENT_CHILD_FD) - if (*fp != nullptr) { - auto file_handle - = reinterpret_cast(_get_osfhandle(::_fileno(*fp))); - if (!::SetHandleInformation(file_handle, HANDLE_FLAG_INHERIT, 0)) { - ::fclose(*fp); - *fp = nullptr; - } - } -#endif -#else // unix -#if defined(SPDLOG_PREVENT_CHILD_FD) - const int mode_flag = mode == SPDLOG_FILENAME_T("ab") ? 
O_APPEND : O_TRUNC; - const int fd = ::open((filename.c_str()), - O_CREAT | O_WRONLY | O_CLOEXEC | mode_flag, mode_t(0644)); - if (fd == -1) { return true; } - *fp = ::fdopen(fd, mode.c_str()); - if (*fp == nullptr) { ::close(fd); } -#else - *fp = ::fopen((filename.c_str()), mode.c_str()); -#endif -#endif - - return *fp == nullptr; -} - -SPDLOG_INLINE int remove(const filename_t &filename) SPDLOG_NOEXCEPT { -#if defined(_WIN32) && defined(SPDLOG_WCHAR_FILENAMES) - return ::_wremove(filename.c_str()); -#else - return std::remove(filename.c_str()); -#endif -} - -SPDLOG_INLINE int remove_if_exists(const filename_t &filename) SPDLOG_NOEXCEPT { - return path_exists(filename) ? remove(filename) : 0; -} - -SPDLOG_INLINE int rename(const filename_t &filename1, - const filename_t &filename2) SPDLOG_NOEXCEPT { -#if defined(_WIN32) && defined(SPDLOG_WCHAR_FILENAMES) - return ::_wrename(filename1.c_str(), filename2.c_str()); -#else - return std::rename(filename1.c_str(), filename2.c_str()); -#endif -} - -// Return true if path exists (file or directory) -SPDLOG_INLINE bool path_exists(const filename_t &filename) SPDLOG_NOEXCEPT { -#ifdef _WIN32 - struct _stat buffer; -#ifdef SPDLOG_WCHAR_FILENAMES - return (::_wstat(filename.c_str(), &buffer) == 0); -#else - return (::_stat(filename.c_str(), &buffer) == 0); -#endif -#else // common linux/unix all have the stat system call - struct stat buffer; - return (::stat(filename.c_str(), &buffer) == 0); -#endif -} - -#ifdef _MSC_VER -// avoid warning about unreachable statement at the end of filesize() -#pragma warning(push) -#pragma warning(disable : 4702) -#endif - -// Return file size according to open FILE* object -SPDLOG_INLINE size_t filesize(FILE *f) { - if (f == nullptr) { - throw_spdlog_ex("Failed getting file size. fd is null"); - } -#if defined(_WIN32) && !defined(__CYGWIN__) - int fd = ::_fileno(f); -#if defined(_WIN64) // 64 bits - __int64 ret = ::_filelengthi64(fd); - if (ret >= 0) { return static_cast(ret); } - -#else // windows 32 bits - long ret = ::_filelength(fd); - if (ret >= 0) { return static_cast(ret); } -#endif - -#else // unix -// OpenBSD and AIX doesn't compile with :: before the fileno(..) -#if defined(__OpenBSD__) || defined(_AIX) - int fd = fileno(f); -#else - int fd = ::fileno(f); -#endif -// 64 bits(but not in osx, linux/musl or cygwin, where fstat64 is deprecated) -#if ((defined(__linux__) && defined(__GLIBC__)) || defined(__sun) \ - || defined(_AIX)) \ - && (defined(__LP64__) || defined(_LP64)) - struct stat64 st; - if (::fstat64(fd, &st) == 0) { return static_cast(st.st_size); } -#else // other unix or linux 32 bits or cygwin - struct stat st; - if (::fstat(fd, &st) == 0) { return static_cast(st.st_size); } -#endif -#endif - throw_spdlog_ex("Failed getting file size from fd", errno); - return 0; // will not be reached. -} - -#ifdef _MSC_VER -#pragma warning(pop) -#endif - -// Return utc offset in minutes or throw spdlog_ex on failure -SPDLOG_INLINE int utc_minutes_offset(const std::tm &tm) { -#ifdef _WIN32 -#if _WIN32_WINNT < _WIN32_WINNT_WS08 - TIME_ZONE_INFORMATION tzinfo; - auto rv = ::GetTimeZoneInformation(&tzinfo); -#else - DYNAMIC_TIME_ZONE_INFORMATION tzinfo; - auto rv = ::GetDynamicTimeZoneInformation(&tzinfo); -#endif - if (rv == TIME_ZONE_ID_INVALID) - throw_spdlog_ex("Failed getting timezone info. 
", errno); - - int offset = -tzinfo.Bias; - if (tm.tm_isdst) { - offset -= tzinfo.DaylightBias; - } else { - offset -= tzinfo.StandardBias; - } - return offset; -#else - -#if defined(sun) || defined(__sun) || defined(_AIX) \ - || (defined(__NEWLIB__) && !defined(__TM_GMTOFF)) \ - || (!defined(_BSD_SOURCE) && !defined(_GNU_SOURCE)) - // 'tm_gmtoff' field is BSD extension and it's missing on SunOS/Solaris - struct helper { - static long int calculate_gmt_offset( - const std::tm &localtm = details::os::localtime(), - const std::tm &gmtm = details::os::gmtime()) { - int local_year = localtm.tm_year + (1900 - 1); - int gmt_year = gmtm.tm_year + (1900 - 1); - - long int days = ( - // difference in day of year - localtm.tm_yday - - gmtm.tm_yday - - // + intervening leap days - + ((local_year >> 2) - (gmt_year >> 2)) - - (local_year / 100 - gmt_year / 100) - + ((local_year / 100 >> 2) - (gmt_year / 100 >> 2)) - - // + difference in years * 365 */ - + static_cast(local_year - gmt_year) * 365); - - long int hours = (24 * days) + (localtm.tm_hour - gmtm.tm_hour); - long int mins = (60 * hours) + (localtm.tm_min - gmtm.tm_min); - long int secs = (60 * mins) + (localtm.tm_sec - gmtm.tm_sec); - - return secs; - } - }; - - auto offset_seconds = helper::calculate_gmt_offset(tm); -#else - auto offset_seconds = tm.tm_gmtoff; -#endif - - return static_cast(offset_seconds / 60); -#endif -} - -// Return current thread id as size_t -// It exists because the std::this_thread::get_id() is much slower(especially -// under VS 2013) -SPDLOG_INLINE size_t _thread_id() SPDLOG_NOEXCEPT { -#ifdef _WIN32 - return static_cast(::GetCurrentThreadId()); -#elif defined(__linux__) -#if defined(__ANDROID__) && defined(__ANDROID_API__) && (__ANDROID_API__ < 21) -#define SYS_gettid __NR_gettid -#endif - return static_cast(::syscall(SYS_gettid)); -#elif defined(_AIX) - struct __pthrdsinfo buf; - int reg_size = 0; - pthread_t pt = pthread_self(); - int retval = pthread_getthrds_np( - &pt, PTHRDSINFO_QUERY_TID, &buf, sizeof(buf), NULL, ®_size); - int tid = (!retval) ? buf.__pi_tid : 0; - return static_cast(tid); -#elif defined(__DragonFly__) || defined(__FreeBSD__) - return static_cast(::pthread_getthreadid_np()); -#elif defined(__NetBSD__) - return static_cast(::_lwp_self()); -#elif defined(__OpenBSD__) - return static_cast(::getthrid()); -#elif defined(__sun) - return static_cast(::thr_self()); -#elif __APPLE__ - uint64_t tid; -// There is no pthread_threadid_np prior to Mac OS X 10.6, and it is not supported on any PPC, -// including 10.6.8 Rosetta. __POWERPC__ is Apple-specific define encompassing ppc and ppc64. 
-#ifdef MAC_OS_X_VERSION_MAX_ALLOWED - { -#if (MAC_OS_X_VERSION_MAX_ALLOWED < 1060) || defined(__POWERPC__) - tid = pthread_mach_thread_np(pthread_self()); -#elif MAC_OS_X_VERSION_MIN_REQUIRED < 1060 - if (&pthread_threadid_np) { - pthread_threadid_np(nullptr, &tid); - } else { - tid = pthread_mach_thread_np(pthread_self()); - } -#else - pthread_threadid_np(nullptr, &tid); -#endif - } -#else - pthread_threadid_np(nullptr, &tid); -#endif - return static_cast(tid); -#else // Default to standard C++11 (other Unix) - return static_cast( - std::hash()(std::this_thread::get_id())); -#endif -} - -// Return current thread id as size_t (from thread local storage) -SPDLOG_INLINE size_t thread_id() SPDLOG_NOEXCEPT { -#if defined(SPDLOG_NO_TLS) - return _thread_id(); -#else // cache thread id in tls - static thread_local const size_t tid = _thread_id(); - return tid; -#endif -} - -// This is avoid msvc issue in sleep_for that happens if the clock changes. -// See https://github.com/gabime/spdlog/issues/609 -SPDLOG_INLINE void sleep_for_millis(unsigned int milliseconds) SPDLOG_NOEXCEPT { -#if defined(_WIN32) - ::Sleep(milliseconds); -#else - std::this_thread::sleep_for(std::chrono::milliseconds(milliseconds)); -#endif -} - -// wchar support for windows file names (SPDLOG_WCHAR_FILENAMES must be defined) -#if defined(_WIN32) && defined(SPDLOG_WCHAR_FILENAMES) -SPDLOG_INLINE std::string filename_to_str(const filename_t &filename) { - memory_buf_t buf; - wstr_to_utf8buf(filename, buf); - return SPDLOG_BUF_TO_STRING(buf); -} -#else -SPDLOG_INLINE std::string filename_to_str(const filename_t &filename) { - return filename; -} -#endif - -SPDLOG_INLINE int pid() SPDLOG_NOEXCEPT { -#ifdef _WIN32 - return conditional_static_cast(::GetCurrentProcessId()); -#else - return conditional_static_cast(::getpid()); -#endif -} - -// Determine if the terminal supports colors -// Based on: https://github.com/agauniyal/rang/ -SPDLOG_INLINE bool is_color_terminal() SPDLOG_NOEXCEPT { -#ifdef _WIN32 - return true; -#else - - static const bool result = []() { - const char *env_colorterm_p = std::getenv("COLORTERM"); - if (env_colorterm_p != nullptr) { return true; } - - static constexpr std::array terms - = {{"ansi", "color", "console", "cygwin", "gnome", "konsole", - "kterm", "linux", "msys", "putty", "rxvt", "screen", - "vt100", "xterm", "alacritty", "vt102"}}; - - const char *env_term_p = std::getenv("TERM"); - if (env_term_p == nullptr) { return false; } - - return std::any_of(terms.begin(), terms.end(), [&](const char *term) { - return std::strstr(env_term_p, term) != nullptr; - }); - }(); - - return result; -#endif -} - -// Determine if the terminal attached -// Source: https://github.com/agauniyal/rang/ -SPDLOG_INLINE bool in_terminal(FILE *file) SPDLOG_NOEXCEPT { -#ifdef _WIN32 - return ::_isatty(_fileno(file)) != 0; -#else - return ::isatty(fileno(file)) != 0; -#endif -} - -#if (defined(SPDLOG_WCHAR_TO_UTF8_SUPPORT) || defined(SPDLOG_WCHAR_FILENAMES)) \ - && defined(_WIN32) -SPDLOG_INLINE void wstr_to_utf8buf(wstring_view_t wstr, memory_buf_t &target) { - if (wstr.size() - > static_cast((std::numeric_limits::max)()) / 4 - 1) { - throw_spdlog_ex("UTF-16 string is too big to be converted to UTF-8"); - } - - int wstr_size = static_cast(wstr.size()); - if (wstr_size == 0) { - target.resize(0); - return; - } - - int result_size = static_cast(target.capacity()); - if ((wstr_size + 1) * 4 > result_size) { - result_size = ::WideCharToMultiByte( - CP_UTF8, 0, wstr.data(), wstr_size, NULL, 0, NULL, NULL); - } - - if (result_size 
> 0) { - target.resize(result_size); - result_size = ::WideCharToMultiByte(CP_UTF8, 0, wstr.data(), wstr_size, - target.data(), result_size, NULL, NULL); - - if (result_size > 0) { - target.resize(result_size); - return; - } - } - - throw_spdlog_ex(fmt_lib::format( - "WideCharToMultiByte failed. Last error: {}", ::GetLastError())); -} - -SPDLOG_INLINE void utf8_to_wstrbuf(string_view_t str, wmemory_buf_t &target) { - if (str.size() - > static_cast((std::numeric_limits::max)()) - 1) { - throw_spdlog_ex("UTF-8 string is too big to be converted to UTF-16"); - } - - int str_size = static_cast(str.size()); - if (str_size == 0) { - target.resize(0); - return; - } - - // find the size to allocate for the result buffer - int result_size = ::MultiByteToWideChar( - CP_UTF8, MB_ERR_INVALID_CHARS, str.data(), str_size, NULL, 0); - - if (result_size > 0) { - target.resize(result_size); - result_size = ::MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, - str.data(), str_size, target.data(), result_size); - if (result_size > 0) { - assert(result_size == target.size()); - return; - } - } - - throw_spdlog_ex(fmt_lib::format( - "MultiByteToWideChar failed. Last error: {}", ::GetLastError())); -} -#endif - -// return true on success -static SPDLOG_INLINE bool mkdir_(const filename_t &path) { -#ifdef _WIN32 -#ifdef SPDLOG_WCHAR_FILENAMES - return ::_wmkdir(path.c_str()) == 0; -#else - return ::_mkdir(path.c_str()) == 0; -#endif -#else - return ::mkdir(path.c_str(), mode_t(0755)) == 0; -#endif -} - -// create the given directory - and all directories leading to it return true on success or if the directory already exists -SPDLOG_INLINE bool create_dir(const filename_t &path) { - if (path_exists(path)) { return true; } - - if (path.empty()) { return false; } - - size_t search_offset = 0; - do { - auto token_pos - = path.find_first_of(folder_seps_filename, search_offset); - // treat the entire path as a folder if no folder separator not found - if (token_pos == filename_t::npos) { token_pos = path.size(); } - - auto subdir = path.substr(0, token_pos); -#ifdef _WIN32 - // if subdir is just a drive letter, add a slash e.g. "c:"=>"c:\", - // otherwise path_exists(subdir) returns false (issue #3079) - const bool is_drive = subdir.length() == 2 && subdir[1] == ':'; - if (is_drive) { - subdir += '\\'; - token_pos++; - } -#endif - - if (!subdir.empty() && !path_exists(subdir) && !mkdir_(subdir)) { - return false; // return error if failed creating dir - } - search_offset = token_pos + 1; - } while (search_offset < path.size()); - - return true; -} - -// Return directory name from given path or empty string -// "abc/file" => "abc" -// "abc/" => "abc" -// "abc" => "" -// "abc///" => "abc//" -SPDLOG_INLINE filename_t dir_name(const filename_t &path) { - auto pos = path.find_last_of(folder_seps_filename); - return pos != filename_t::npos ? path.substr(0, pos) : filename_t {}; -} - -std::string SPDLOG_INLINE getenv(const char *field) { -#if defined(_MSC_VER) -#if defined(__cplusplus_winrt) - return std::string {}; // not supported under uwp -#else - size_t len = 0; - char buf[128]; - bool ok = ::getenv_s(&len, buf, sizeof(buf), field) == 0; - return ok ? buf : std::string {}; -#endif -#else // revert to getenv - char *buf = ::getenv(field); - return buf ? 
buf : std::string {}; -#endif -} - -// Do fsync by FILE handlerpointer -// Return true on success -SPDLOG_INLINE bool fsync(FILE *fp) { -#ifdef _WIN32 - return FlushFileBuffers( - reinterpret_cast(_get_osfhandle(_fileno(fp)))) - != 0; -#else - return ::fsync(fileno(fp)) == 0; -#endif -} - -} // namespace os -} // namespace details -} // namespace spdlog diff --git a/src/common/spdlog/details/registry-inl.h b/src/common/spdlog/details/registry-inl.h deleted file mode 100755 index b34c9cf07cf..00000000000 --- a/src/common/spdlog/details/registry-inl.h +++ /dev/null @@ -1,271 +0,0 @@ -// Copyright(c) 2015-present, Gabi Melman & spdlog contributors. -// Distributed under the MIT License (http://opensource.org/licenses/MIT) - -#pragma once - -#ifndef SPDLOG_HEADER_ONLY -#include -#endif - -#include -#include -#include -#include - -#ifndef SPDLOG_DISABLE_DEFAULT_LOGGER -// support for the default stdout color logger -#ifdef _WIN32 -#include -#else -#include -#endif -#endif // SPDLOG_DISABLE_DEFAULT_LOGGER - -#include -#include -#include -#include -#include - -namespace spdlog { -namespace details { - -SPDLOG_INLINE registry::registry() : formatter_(new pattern_formatter()) { -#ifndef SPDLOG_DISABLE_DEFAULT_LOGGER -// create default logger (ansicolor_stdout_sink_mt or wincolor_stdout_sink_mt in windows). -#ifdef _WIN32 - auto color_sink = std::make_shared(); -#else - auto color_sink = std::make_shared(); -#endif - - const char *default_logger_name = ""; - default_logger_ = std::make_shared( - default_logger_name, std::move(color_sink)); - loggers_[default_logger_name] = default_logger_; - -#endif // SPDLOG_DISABLE_DEFAULT_LOGGER -} - -SPDLOG_INLINE registry::~registry() = default; - -SPDLOG_INLINE void registry::register_logger( - std::shared_ptr new_logger) { - std::lock_guard lock(logger_map_mutex_); - register_logger_(std::move(new_logger)); -} - -SPDLOG_INLINE void registry::initialize_logger( - std::shared_ptr new_logger) { - std::lock_guard lock(logger_map_mutex_); - new_logger->set_formatter(formatter_->clone()); - - if (err_handler_) { new_logger->set_error_handler(err_handler_); } - - // set new level according to previously configured level or default level - auto it = log_levels_.find(new_logger->name()); - auto new_level = it != log_levels_.end() ? it->second : global_log_level_; - new_logger->set_level(new_level); - - new_logger->flush_on(flush_level_); - - if (backtrace_n_messages_ > 0) { - new_logger->enable_backtrace(backtrace_n_messages_); - } - - if (automatic_registration_) { register_logger_(std::move(new_logger)); } -} - -SPDLOG_INLINE std::shared_ptr registry::get( - const std::string &logger_name) { - std::lock_guard lock(logger_map_mutex_); - auto found = loggers_.find(logger_name); - return found == loggers_.end() ? nullptr : found->second; -} - -SPDLOG_INLINE std::shared_ptr registry::default_logger() { - std::lock_guard lock(logger_map_mutex_); - return default_logger_; -} - -// Return raw ptr to the default logger. -// To be used directly by the spdlog default api (e.g. spdlog::info) -// This make the default API faster, but cannot be used concurrently with set_default_logger(). -// e.g do not call set_default_logger() from one thread while calling spdlog::info() from another. -SPDLOG_INLINE logger *registry::get_default_raw() { - return default_logger_.get(); -} - -// set default logger. -// default logger is stored in default_logger_ (for faster retrieval) and in the loggers_ map. 
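The registry being deleted here is a process-wide, mutex-guarded map of named loggers behind a function-local static. A minimal sketch of that pattern with an illustrative stand-in logger type (not spdlog's actual interface):

```cpp
#include <iostream>
#include <map>
#include <memory>
#include <mutex>
#include <string>

struct logger {
    explicit logger(std::string n) : name(std::move(n)) {}
    std::string name;
};

class registry {
public:
    static registry &instance() {
        static registry s_instance; // constructed once; thread-safe since C++11
        return s_instance;
    }

    void register_logger(std::shared_ptr<logger> l) {
        std::lock_guard<std::mutex> lock(mutex_);
        loggers_[l->name] = std::move(l);
    }

    std::shared_ptr<logger> get(const std::string &name) {
        std::lock_guard<std::mutex> lock(mutex_);
        auto it = loggers_.find(name);
        return it == loggers_.end() ? nullptr : it->second;
    }

private:
    registry() = default;
    std::mutex mutex_;
    std::map<std::string, std::shared_ptr<logger>> loggers_;
};

int main() {
    registry::instance().register_logger(std::make_shared<logger>("app"));
    std::cout << (registry::instance().get("app") != nullptr) << '\n'; // 1
}
```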
-SPDLOG_INLINE void registry::set_default_logger( - std::shared_ptr new_default_logger) { - std::lock_guard lock(logger_map_mutex_); - if (new_default_logger != nullptr) { - loggers_[new_default_logger->name()] = new_default_logger; - } - default_logger_ = std::move(new_default_logger); -} - -SPDLOG_INLINE void registry::set_tp(std::shared_ptr tp) { - std::lock_guard lock(tp_mutex_); - tp_ = std::move(tp); -} - -SPDLOG_INLINE std::shared_ptr registry::get_tp() { - std::lock_guard lock(tp_mutex_); - return tp_; -} - -// Set global formatter. Each sink in each logger will get a clone of this object -SPDLOG_INLINE void registry::set_formatter( - std::unique_ptr formatter) { - std::lock_guard lock(logger_map_mutex_); - formatter_ = std::move(formatter); - for (auto &l : loggers_) { - l.second->set_formatter(formatter_->clone()); - } -} - -SPDLOG_INLINE void registry::enable_backtrace(size_t n_messages) { - std::lock_guard lock(logger_map_mutex_); - backtrace_n_messages_ = n_messages; - - for (auto &l : loggers_) { - l.second->enable_backtrace(n_messages); - } -} - -SPDLOG_INLINE void registry::disable_backtrace() { - std::lock_guard lock(logger_map_mutex_); - backtrace_n_messages_ = 0; - for (auto &l : loggers_) { - l.second->disable_backtrace(); - } -} - -SPDLOG_INLINE void registry::set_level(level::level_enum log_level) { - std::lock_guard lock(logger_map_mutex_); - for (auto &l : loggers_) { - l.second->set_level(log_level); - } - global_log_level_ = log_level; -} - -SPDLOG_INLINE void registry::flush_on(level::level_enum log_level) { - std::lock_guard lock(logger_map_mutex_); - for (auto &l : loggers_) { - l.second->flush_on(log_level); - } - flush_level_ = log_level; -} - -SPDLOG_INLINE void registry::set_error_handler(err_handler handler) { - std::lock_guard lock(logger_map_mutex_); - for (auto &l : loggers_) { - l.second->set_error_handler(handler); - } - err_handler_ = std::move(handler); -} - -SPDLOG_INLINE void registry::apply_all( - const std::function)> &fun) { - std::lock_guard lock(logger_map_mutex_); - for (auto &l : loggers_) { - fun(l.second); - } -} - -SPDLOG_INLINE void registry::flush_all() { - std::lock_guard lock(logger_map_mutex_); - for (auto &l : loggers_) { - l.second->flush(); - } -} - -SPDLOG_INLINE void registry::drop(const std::string &logger_name) { - std::lock_guard lock(logger_map_mutex_); - auto is_default_logger - = default_logger_ && default_logger_->name() == logger_name; - loggers_.erase(logger_name); - if (is_default_logger) { default_logger_.reset(); } -} - -SPDLOG_INLINE void registry::drop_all() { - std::lock_guard lock(logger_map_mutex_); - loggers_.clear(); - default_logger_.reset(); -} - -// clean all resources and threads started by the registry -SPDLOG_INLINE void registry::shutdown() { - { - std::lock_guard lock(flusher_mutex_); - periodic_flusher_.reset(); - } - - drop_all(); - - { - std::lock_guard lock(tp_mutex_); - tp_.reset(); - } -} - -SPDLOG_INLINE std::recursive_mutex ®istry::tp_mutex() { - return tp_mutex_; -} - -SPDLOG_INLINE void registry::set_automatic_registration( - bool automatic_registration) { - std::lock_guard lock(logger_map_mutex_); - automatic_registration_ = automatic_registration; -} - -SPDLOG_INLINE void registry::set_levels( - log_levels levels, level::level_enum *global_level) { - std::lock_guard lock(logger_map_mutex_); - log_levels_ = std::move(levels); - auto global_level_requested = global_level != nullptr; - global_log_level_ - = global_level_requested ? 
*global_level : global_log_level_; - - for (auto &logger : loggers_) { - auto logger_entry = log_levels_.find(logger.first); - if (logger_entry != log_levels_.end()) { - logger.second->set_level(logger_entry->second); - } else if (global_level_requested) { - logger.second->set_level(*global_level); - } - } -} - -SPDLOG_INLINE registry ®istry::instance() { - static registry s_instance; - return s_instance; -} - -SPDLOG_INLINE void registry::apply_logger_env_levels( - std::shared_ptr new_logger) { - std::lock_guard lock(logger_map_mutex_); - auto it = log_levels_.find(new_logger->name()); - auto new_level = it != log_levels_.end() ? it->second : global_log_level_; - new_logger->set_level(new_level); -} - -SPDLOG_INLINE void registry::throw_if_exists_(const std::string &logger_name) { - if (loggers_.find(logger_name) != loggers_.end()) { - throw_spdlog_ex( - "logger with name '" + logger_name + "' already exists"); - } -} - -SPDLOG_INLINE void registry::register_logger_( - std::shared_ptr new_logger) { - auto logger_name = new_logger->name(); - throw_if_exists_(logger_name); - loggers_[logger_name] = std::move(new_logger); -} - -} // namespace details -} // namespace spdlog diff --git a/src/common/spdlog/details/synchronous_factory.h b/src/common/spdlog/details/synchronous_factory.h deleted file mode 100755 index dbe67d72d45..00000000000 --- a/src/common/spdlog/details/synchronous_factory.h +++ /dev/null @@ -1,24 +0,0 @@ -// Copyright(c) 2015-present, Gabi Melman & spdlog contributors. -// Distributed under the MIT License (http://opensource.org/licenses/MIT) - -#pragma once - -#include "registry.h" - -namespace spdlog { - -// Default logger factory- creates synchronous loggers -class logger; - -struct synchronous_factory { - template - static std::shared_ptr create( - std::string logger_name, SinkArgs &&...args) { - auto sink = std::make_shared(std::forward(args)...); - auto new_logger = std::make_shared( - std::move(logger_name), std::move(sink)); - details::registry::instance().initialize_logger(new_logger); - return new_logger; - } -}; -} // namespace spdlog diff --git a/src/common/spdlog/details/windows_include.h b/src/common/spdlog/details/windows_include.h deleted file mode 100755 index 6a2f14f9c76..00000000000 --- a/src/common/spdlog/details/windows_include.h +++ /dev/null @@ -1,11 +0,0 @@ -#pragma once - -#ifndef NOMINMAX -#define NOMINMAX // prevent windows redefining min/max -#endif - -#ifndef WIN32_LEAN_AND_MEAN -#define WIN32_LEAN_AND_MEAN -#endif - -#include diff --git a/src/common/spdlog/fmt/bundled/core.h b/src/common/spdlog/fmt/bundled/core.h deleted file mode 100755 index 26686495b67..00000000000 --- a/src/common/spdlog/fmt/bundled/core.h +++ /dev/null @@ -1,3059 +0,0 @@ -// Formatting library for C++ - the core API for char/UTF-8 -// -// Copyright (c) 2012 - present, Victor Zverovich -// All rights reserved. -// -// For the license information refer to format.h. - -#ifndef FMT_CORE_H_ -#define FMT_CORE_H_ - -#include // std::byte -#include // std::FILE -#include // std::strlen -#include -#include -#include // std::addressof -#include -#include - -// The fmt library version in the form major * 10000 + minor * 100 + patch. 
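The synchronous_factory deleted above is a small factory template: it perfect-forwards its arguments to a sink constructor and wraps the sink in a logger. A self-contained sketch of that shape, with hypothetical logger/console_sink stand-ins rather than spdlog's types:

```cpp
#include <iostream>
#include <memory>
#include <string>
#include <utility>

struct logger {
    std::string name;
    std::shared_ptr<void> sink; // type-erased for the sketch
};

struct console_sink {
    explicit console_sink(std::string prefix) : prefix(std::move(prefix)) {}
    std::string prefix;
};

struct synchronous_factory {
    // Forward arbitrary constructor arguments to the chosen sink type.
    template <typename Sink, typename... SinkArgs>
    static std::shared_ptr<logger> create(std::string logger_name,
            SinkArgs &&...args) {
        auto sink = std::make_shared<Sink>(std::forward<SinkArgs>(args)...);
        return std::make_shared<logger>(
                logger {std::move(logger_name), std::move(sink)});
    }
};

int main() {
    auto log = synchronous_factory::create<console_sink>("app", "[app] ");
    std::cout << log->name << '\n'; // "app"
}
```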
-#define FMT_VERSION 100201 - -#if defined(__clang__) && !defined(__ibmxl__) -#define FMT_CLANG_VERSION (__clang_major__ * 100 + __clang_minor__) -#else -#define FMT_CLANG_VERSION 0 -#endif - -#if defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER) \ - && !defined(__NVCOMPILER) -#define FMT_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__) -#else -#define FMT_GCC_VERSION 0 -#endif - -#ifndef FMT_GCC_PRAGMA -// Workaround _Pragma bug https://gcc.gnu.org/bugzilla/show_bug.cgi?id=59884. -#if FMT_GCC_VERSION >= 504 -#define FMT_GCC_PRAGMA(arg) _Pragma(arg) -#else -#define FMT_GCC_PRAGMA(arg) -#endif -#endif - -#ifdef __ICL -#define FMT_ICC_VERSION __ICL -#elif defined(__INTEL_COMPILER) -#define FMT_ICC_VERSION __INTEL_COMPILER -#else -#define FMT_ICC_VERSION 0 -#endif - -#ifdef _MSC_VER -#define FMT_MSC_VERSION _MSC_VER -#define FMT_MSC_WARNING(...) __pragma(warning(__VA_ARGS__)) -#else -#define FMT_MSC_VERSION 0 -#define FMT_MSC_WARNING(...) -#endif - -#ifdef _MSVC_LANG -#define FMT_CPLUSPLUS _MSVC_LANG -#else -#define FMT_CPLUSPLUS __cplusplus -#endif - -#ifdef __has_feature -#define FMT_HAS_FEATURE(x) __has_feature(x) -#else -#define FMT_HAS_FEATURE(x) 0 -#endif - -#if defined(__has_include) || FMT_ICC_VERSION >= 1600 || FMT_MSC_VERSION > 1900 -#define FMT_HAS_INCLUDE(x) __has_include(x) -#else -#define FMT_HAS_INCLUDE(x) 0 -#endif - -#ifdef __has_cpp_attribute -#define FMT_HAS_CPP_ATTRIBUTE(x) __has_cpp_attribute(x) -#else -#define FMT_HAS_CPP_ATTRIBUTE(x) 0 -#endif - -#define FMT_HAS_CPP14_ATTRIBUTE(attribute) \ - (FMT_CPLUSPLUS >= 201402L && FMT_HAS_CPP_ATTRIBUTE(attribute)) - -#define FMT_HAS_CPP17_ATTRIBUTE(attribute) \ - (FMT_CPLUSPLUS >= 201703L && FMT_HAS_CPP_ATTRIBUTE(attribute)) - -// Check if relaxed C++14 constexpr is supported. -// GCC doesn't allow throw in constexpr until version 6 (bug 67371). -#ifndef FMT_USE_CONSTEXPR -#if (FMT_HAS_FEATURE(cxx_relaxed_constexpr) || FMT_MSC_VERSION >= 1912 \ - || (FMT_GCC_VERSION >= 600 && FMT_CPLUSPLUS >= 201402L)) \ - && !FMT_ICC_VERSION \ - && (!defined(__NVCC__) || FMT_CPLUSPLUS >= 202002L) -#define FMT_USE_CONSTEXPR 1 -#else -#define FMT_USE_CONSTEXPR 0 -#endif -#endif -#if FMT_USE_CONSTEXPR -#define FMT_CONSTEXPR constexpr -#else -#define FMT_CONSTEXPR -#endif - -#if (FMT_CPLUSPLUS >= 202002L \ - || (FMT_CPLUSPLUS >= 201709L && FMT_GCC_VERSION >= 1002)) \ - && ((!defined(_GLIBCXX_RELEASE) || _GLIBCXX_RELEASE >= 10) \ - && (!defined(_LIBCPP_VERSION) || _LIBCPP_VERSION >= 10000) \ - && (!FMT_MSC_VERSION || FMT_MSC_VERSION >= 1928)) \ - && defined(__cpp_lib_is_constant_evaluated) -#define FMT_CONSTEXPR20 constexpr -#else -#define FMT_CONSTEXPR20 -#endif - -// Check if constexpr std::char_traits<>::{compare,length} are supported. -#if defined(__GLIBCXX__) -#if FMT_CPLUSPLUS >= 201703L && defined(_GLIBCXX_RELEASE) \ - && _GLIBCXX_RELEASE >= 7 // GCC 7+ libstdc++ has _GLIBCXX_RELEASE. -#define FMT_CONSTEXPR_CHAR_TRAITS constexpr -#endif -#elif defined(_LIBCPP_VERSION) && FMT_CPLUSPLUS >= 201703L \ - && _LIBCPP_VERSION >= 4000 -#define FMT_CONSTEXPR_CHAR_TRAITS constexpr -#elif FMT_MSC_VERSION >= 1914 && FMT_CPLUSPLUS >= 201703L -#define FMT_CONSTEXPR_CHAR_TRAITS constexpr -#endif -#ifndef FMT_CONSTEXPR_CHAR_TRAITS -#define FMT_CONSTEXPR_CHAR_TRAITS -#endif - -// Check if exceptions are disabled. 
-#ifndef FMT_EXCEPTIONS -#if (defined(__GNUC__) && !defined(__EXCEPTIONS)) \ - || (FMT_MSC_VERSION && !_HAS_EXCEPTIONS) -#define FMT_EXCEPTIONS 0 -#else -#define FMT_EXCEPTIONS 1 -#endif -#endif - -// Disable [[noreturn]] on MSVC/NVCC because of bogus unreachable code warnings. -#if FMT_EXCEPTIONS && FMT_HAS_CPP_ATTRIBUTE(noreturn) && !FMT_MSC_VERSION \ - && !defined(__NVCC__) -#define FMT_NORETURN [[noreturn]] -#else -#define FMT_NORETURN -#endif - -#ifndef FMT_NODISCARD -#if FMT_HAS_CPP17_ATTRIBUTE(nodiscard) -#define FMT_NODISCARD [[nodiscard]] -#else -#define FMT_NODISCARD -#endif -#endif - -#ifndef FMT_INLINE -#if FMT_GCC_VERSION || FMT_CLANG_VERSION -#define FMT_INLINE inline __attribute__((always_inline)) -#else -#define FMT_INLINE inline -#endif -#endif - -#ifdef _MSC_VER -#define FMT_UNCHECKED_ITERATOR(It) \ - using _Unchecked_type = It // Mark iterator as checked. -#else -#define FMT_UNCHECKED_ITERATOR(It) using unchecked_type = It -#endif - -#ifndef FMT_BEGIN_NAMESPACE -#define FMT_BEGIN_NAMESPACE \ - namespace fmt { \ - inline namespace v10 { -#define FMT_END_NAMESPACE \ - } \ - } -#endif - -#ifndef FMT_EXPORT -#define FMT_EXPORT -#define FMT_BEGIN_EXPORT -#define FMT_END_EXPORT -#endif - -#if FMT_GCC_VERSION || FMT_CLANG_VERSION -#define FMT_VISIBILITY(value) __attribute__((visibility(value))) -#else -#define FMT_VISIBILITY(value) -#endif - -#if !defined(FMT_HEADER_ONLY) && defined(_WIN32) -#if defined(FMT_LIB_EXPORT) -#define FMT_API __declspec(dllexport) -#elif defined(FMT_SHARED) -#define FMT_API __declspec(dllimport) -#endif -#elif defined(FMT_LIB_EXPORT) || defined(FMT_SHARED) -#define FMT_API FMT_VISIBILITY("default") -#endif -#ifndef FMT_API -#define FMT_API -#endif - -// libc++ supports string_view in pre-c++17. -#if FMT_HAS_INCLUDE() \ - && (FMT_CPLUSPLUS >= 201703L || defined(_LIBCPP_VERSION)) -#include -#define FMT_USE_STRING_VIEW -#elif FMT_HAS_INCLUDE("experimental/string_view") && FMT_CPLUSPLUS >= 201402L -#include -#define FMT_USE_EXPERIMENTAL_STRING_VIEW -#endif - -#ifndef FMT_UNICODE -#define FMT_UNICODE !FMT_MSC_VERSION -#endif - -#ifndef FMT_CONSTEVAL -#if ((FMT_GCC_VERSION >= 1000 || FMT_CLANG_VERSION >= 1101) \ - && (!defined(__apple_build_version__) \ - || __apple_build_version__ >= 14000029L) \ - && FMT_CPLUSPLUS >= 202002L) \ - || (defined(__cpp_consteval) \ - && (!FMT_MSC_VERSION || FMT_MSC_VERSION >= 1929)) -// consteval is broken in MSVC before VS2019 version 16.10 and Apple clang -// before 14. -#define FMT_CONSTEVAL consteval -#define FMT_HAS_CONSTEVAL -#else -#define FMT_CONSTEVAL -#endif -#endif - -#ifndef FMT_USE_NONTYPE_TEMPLATE_ARGS -#if defined(__cpp_nontype_template_args) \ - && ((FMT_GCC_VERSION >= 903 && FMT_CPLUSPLUS >= 201709L) \ - || __cpp_nontype_template_args >= 201911L) \ - && !defined(__NVCOMPILER) && !defined(__LCC__) -#define FMT_USE_NONTYPE_TEMPLATE_ARGS 1 -#else -#define FMT_USE_NONTYPE_TEMPLATE_ARGS 0 -#endif -#endif - -// GCC < 5 requires this-> in decltype -#ifndef FMT_DECLTYPE_THIS -#if FMT_GCC_VERSION && FMT_GCC_VERSION < 500 -#define FMT_DECLTYPE_THIS this-> -#else -#define FMT_DECLTYPE_THIS -#endif -#endif - -// Enable minimal optimizations for more compact code in debug mode. -FMT_GCC_PRAGMA("GCC push_options") -#if !defined(__OPTIMIZE__) && !defined(__NVCOMPILER) && !defined(__LCC__) \ - && !defined(__CUDACC__) -FMT_GCC_PRAGMA("GCC optimize(\"Og\")") -#endif - -FMT_BEGIN_NAMESPACE - -// Implementations of enable_if_t and other metafunctions for older systems. 
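The deleted fmt header applies one pattern throughout this block: probe the compiler or standard library first, then define a single project macro that the rest of the code uses unconditionally. A minimal standalone example of that detection pattern (macro names here are illustrative, not fmt's):

```cpp
#include <cstdio>

// Step 1: make the probe itself safe on compilers that lack it.
#if defined(__has_cpp_attribute)
#define MY_HAS_CPP_ATTRIBUTE(x) __has_cpp_attribute(x)
#else
#define MY_HAS_CPP_ATTRIBUTE(x) 0
#endif

// Step 2: define one macro the rest of the code can use everywhere.
#if MY_HAS_CPP_ATTRIBUTE(nodiscard)
#define MY_NODISCARD [[nodiscard]]
#else
#define MY_NODISCARD
#endif

MY_NODISCARD int answer() {
    return 42;
}

int main() {
    std::printf("%d\n", answer()); // using the result satisfies [[nodiscard]]
}
```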
-template -using enable_if_t = typename std::enable_if::type; -template -using conditional_t = typename std::conditional::type; -template -using bool_constant = std::integral_constant; -template -using remove_reference_t = typename std::remove_reference::type; -template -using remove_const_t = typename std::remove_const::type; -template -using remove_cvref_t = typename std::remove_cv>::type; -template -struct type_identity { - using type = T; -}; -template -using type_identity_t = typename type_identity::type; -template -using underlying_t = typename std::underlying_type::type; - -// Checks whether T is a container with contiguous storage. -template -struct is_contiguous : std::false_type {}; -template -struct is_contiguous> : std::true_type {}; - -struct monostate { - constexpr monostate() {} -}; - -// An enable_if helper to be used in template parameters which results in much -// shorter symbols: https://godbolt.org/z/sWw4vP. Extra parentheses are needed -// to workaround a bug in MSVC 2019 (see #1140 and #1186). -#ifdef FMT_DOC -#define FMT_ENABLE_IF(...) -#else -#define FMT_ENABLE_IF(...) fmt::enable_if_t<(__VA_ARGS__), int> = 0 -#endif - -// This is defined in core.h instead of format.h to avoid injecting in std. -// It is a template to avoid undesirable implicit conversions to std::byte. -#ifdef __cpp_lib_byte -template ::value)> -inline auto format_as(T b) -> unsigned char { - return static_cast(b); -} -#endif - -namespace detail { -// Suppresses "unused variable" warnings with the method described in -// https://herbsutter.com/2009/10/18/mailbag-shutting-up-compiler-warnings/. -// (void)var does not work on many Intel compilers. -template -FMT_CONSTEXPR void ignore_unused(const T &...) {} - -constexpr FMT_INLINE auto is_constant_evaluated( - bool default_value = false) noexcept -> bool { -// Workaround for incompatibility between libstdc++ consteval-based -// std::is_constant_evaluated() implementation and clang-14. -// https://github.com/fmtlib/fmt/issues/3247 -#if FMT_CPLUSPLUS >= 202002L && defined(_GLIBCXX_RELEASE) \ - && _GLIBCXX_RELEASE >= 12 \ - && (FMT_CLANG_VERSION >= 1400 && FMT_CLANG_VERSION < 1500) - ignore_unused(default_value); - return __builtin_is_constant_evaluated(); -#elif defined(__cpp_lib_is_constant_evaluated) - ignore_unused(default_value); - return std::is_constant_evaluated(); -#else - return default_value; -#endif -} - -// Suppresses "conditional expression is constant" warnings. -template -constexpr FMT_INLINE auto const_check(T value) -> T { - return value; -} - -FMT_NORETURN FMT_API void assert_fail( - const char *file, int line, const char *message); - -#ifndef FMT_ASSERT -#ifdef NDEBUG -// FMT_ASSERT is not empty to avoid -Wempty-body. -#define FMT_ASSERT(condition, message) \ - fmt::detail::ignore_unused((condition), (message)) -#else -#define FMT_ASSERT(condition, message) \ - ((condition) /* void() fails with -Winvalid-constexpr on clang 4.0.1 */ \ - ? (void)0 \ - : fmt::detail::assert_fail(__FILE__, __LINE__, (message))) -#endif -#endif - -#if defined(FMT_USE_STRING_VIEW) -template -using std_string_view = std::basic_string_view; -#elif defined(FMT_USE_EXPERIMENTAL_STRING_VIEW) -template -using std_string_view = std::experimental::basic_string_view; -#else -template -struct std_string_view {}; -#endif - -#ifdef FMT_USE_INT128 -// Do nothing. -#elif defined(__SIZEOF_INT128__) && !defined(__NVCC__) \ - && !(FMT_CLANG_VERSION && FMT_MSC_VERSION) -#define FMT_USE_INT128 1 -using int128_opt = __int128_t; // An optional native 128-bit integer. 
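The FMT_ENABLE_IF helper defined above hides enable_if behind a defaulted non-type template parameter, which keeps mangled symbol names short. A standalone sketch of how that idiom selects between overloads (macro and function names are illustrative):

```cpp
#include <iostream>
#include <type_traits>

// A defaulted non-type template parameter that only exists when the
// condition holds; substitution failure removes the overload.
#define MY_ENABLE_IF(...) \
    typename std::enable_if<(__VA_ARGS__), int>::type = 0

template <typename T, MY_ENABLE_IF(std::is_integral<T>::value)>
const char *describe(T) {
    return "integral";
}

template <typename T, MY_ENABLE_IF(std::is_floating_point<T>::value)>
const char *describe(T) {
    return "floating-point";
}

int main() {
    std::cout << describe(1) << '\n'; // integral
    std::cout << describe(1.5) << '\n'; // floating-point
}
```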
-using uint128_opt = __uint128_t; -template -inline auto convert_for_visit(T value) -> T { - return value; -} -#else -#define FMT_USE_INT128 0 -#endif -#if !FMT_USE_INT128 -enum class int128_opt {}; -enum class uint128_opt {}; -// Reduce template instantiations. -template -auto convert_for_visit(T) -> monostate { - return {}; -} -#endif - -// Casts a nonnegative integer to unsigned. -template -FMT_CONSTEXPR auto to_unsigned(Int value) -> - typename std::make_unsigned::type { - FMT_ASSERT(std::is_unsigned::value || value >= 0, "negative value"); - return static_cast::type>(value); -} - -FMT_CONSTEXPR inline auto is_utf8() -> bool { - FMT_MSC_WARNING(suppress : 4566) - constexpr unsigned char section[] = "\u00A7"; - - // Avoid buggy sign extensions in MSVC's constant evaluation mode (#2297). - using uchar = unsigned char; - return FMT_UNICODE - || (sizeof(section) == 3 && uchar(section[0]) == 0xC2 - && uchar(section[1]) == 0xA7); -} -} // namespace detail - -/** - An implementation of ``std::basic_string_view`` for pre-C++17. It provides a - subset of the API. ``fmt::basic_string_view`` is used for format strings even - if ``std::string_view`` is available to prevent issues when a library is - compiled with a different ``-std`` option than the client code (which is not - recommended). - */ -FMT_EXPORT -template -class basic_string_view { -private: - const Char *data_; - size_t size_; - -public: - using value_type = Char; - using iterator = const Char *; - - constexpr basic_string_view() noexcept : data_(nullptr), size_(0) {} - - /** Constructs a string reference object from a C string and a size. */ - constexpr basic_string_view(const Char *s, size_t count) noexcept - : data_(s), size_(count) {} - - /** - \rst - Constructs a string reference object from a C string computing - the size with ``std::char_traits::length``. - \endrst - */ - FMT_CONSTEXPR_CHAR_TRAITS - FMT_INLINE - basic_string_view(const Char *s) - : data_(s) - , size_(detail::const_check(std::is_same::value - && !detail::is_constant_evaluated(true)) - ? std::strlen(reinterpret_cast(s)) - : std::char_traits::length(s)) {} - - /** Constructs a string reference from a ``std::basic_string`` object. */ - template - FMT_CONSTEXPR basic_string_view( - const std::basic_string &s) noexcept - : data_(s.data()), size_(s.size()) {} - - template >::value)> - FMT_CONSTEXPR basic_string_view(S s) noexcept - : data_(s.data()), size_(s.size()) {} - - /** Returns a pointer to the string data. */ - constexpr auto data() const noexcept -> const Char * { return data_; } - - /** Returns the string size. */ - constexpr auto size() const noexcept -> size_t { return size_; } - - constexpr auto begin() const noexcept -> iterator { return data_; } - constexpr auto end() const noexcept -> iterator { return data_ + size_; } - - constexpr auto operator[](size_t pos) const noexcept -> const Char & { - return data_[pos]; - } - - FMT_CONSTEXPR void remove_prefix(size_t n) noexcept { - data_ += n; - size_ -= n; - } - - FMT_CONSTEXPR_CHAR_TRAITS auto starts_with( - basic_string_view sv) const noexcept -> bool { - return size_ >= sv.size_ - && std::char_traits::compare(data_, sv.data_, sv.size_) - == 0; - } - FMT_CONSTEXPR_CHAR_TRAITS auto starts_with(Char c) const noexcept -> bool { - return size_ >= 1 && std::char_traits::eq(*data_, c); - } - FMT_CONSTEXPR_CHAR_TRAITS auto starts_with(const Char *s) const -> bool { - return starts_with(basic_string_view(s)); - } - - // Lexicographically compare this string reference to other. 
- FMT_CONSTEXPR_CHAR_TRAITS auto compare(basic_string_view other) const - -> int { - size_t str_size = size_ < other.size_ ? size_ : other.size_; - int result - = std::char_traits::compare(data_, other.data_, str_size); - if (result == 0) - result = size_ == other.size_ ? 0 : (size_ < other.size_ ? -1 : 1); - return result; - } - - FMT_CONSTEXPR_CHAR_TRAITS friend auto operator==( - basic_string_view lhs, basic_string_view rhs) -> bool { - return lhs.compare(rhs) == 0; - } - friend auto operator!=(basic_string_view lhs, basic_string_view rhs) - -> bool { - return lhs.compare(rhs) != 0; - } - friend auto operator<(basic_string_view lhs, basic_string_view rhs) - -> bool { - return lhs.compare(rhs) < 0; - } - friend auto operator<=(basic_string_view lhs, basic_string_view rhs) - -> bool { - return lhs.compare(rhs) <= 0; - } - friend auto operator>(basic_string_view lhs, basic_string_view rhs) - -> bool { - return lhs.compare(rhs) > 0; - } - friend auto operator>=(basic_string_view lhs, basic_string_view rhs) - -> bool { - return lhs.compare(rhs) >= 0; - } -}; - -FMT_EXPORT -using string_view = basic_string_view; - -/** Specifies if ``T`` is a character type. Can be specialized by users. */ -FMT_EXPORT -template -struct is_char : std::false_type {}; -template <> -struct is_char : std::true_type {}; - -namespace detail { - -// A base class for compile-time strings. -struct compile_string {}; - -template -struct is_compile_string : std::is_base_of {}; - -template ::value)> -FMT_INLINE auto to_string_view(const Char *s) -> basic_string_view { - return s; -} -template -inline auto to_string_view(const std::basic_string &s) - -> basic_string_view { - return s; -} -template -constexpr auto to_string_view(basic_string_view s) - -> basic_string_view { - return s; -} -template >::value)> -inline auto to_string_view(std_string_view s) -> basic_string_view { - return s; -} -template ::value)> -constexpr auto to_string_view(const S &s) - -> basic_string_view { - return basic_string_view(s); -} -void to_string_view(...); - -// Specifies whether S is a string type convertible to fmt::basic_string_view. -// It should be a constexpr function but MSVC 2017 fails to compile it in -// enable_if and MSVC 2015 fails to compile it as an alias template. -// Arg Dep Lookup is intentionally disabled as to_string_view is not an -// extension point. -template -struct is_string - : std::is_class()))> {}; - -template -struct char_t_impl {}; -template -struct char_t_impl::value>> { - using result = decltype(to_string_view(std::declval())); - using type = typename result::value_type; -}; - -enum class type { - none_type, - // Integer types should go first, - int_type, - uint_type, - long_long_type, - ulong_long_type, - int128_type, - uint128_type, - bool_type, - char_type, - last_integer_type = char_type, - // followed by floating-point types. - float_type, - double_type, - long_double_type, - last_numeric_type = long_double_type, - cstring_type, - string_type, - pointer_type, - custom_type -}; - -// Maps core type T to the corresponding type enum constant. 
-template -struct type_constant : std::integral_constant {}; - -#define FMT_TYPE_CONSTANT(Type, constant) \ - template \ - struct type_constant \ - : std::integral_constant {} - -FMT_TYPE_CONSTANT(int, int_type); -FMT_TYPE_CONSTANT(unsigned, uint_type); -FMT_TYPE_CONSTANT(long long, long_long_type); -FMT_TYPE_CONSTANT(unsigned long long, ulong_long_type); -FMT_TYPE_CONSTANT(int128_opt, int128_type); -FMT_TYPE_CONSTANT(uint128_opt, uint128_type); -FMT_TYPE_CONSTANT(bool, bool_type); -FMT_TYPE_CONSTANT(Char, char_type); -FMT_TYPE_CONSTANT(float, float_type); -FMT_TYPE_CONSTANT(double, double_type); -FMT_TYPE_CONSTANT(long double, long_double_type); -FMT_TYPE_CONSTANT(const Char *, cstring_type); -FMT_TYPE_CONSTANT(basic_string_view, string_type); -FMT_TYPE_CONSTANT(const void *, pointer_type); - -constexpr auto is_integral_type(type t) -> bool { - return t > type::none_type && t <= type::last_integer_type; -} -constexpr auto is_arithmetic_type(type t) -> bool { - return t > type::none_type && t <= type::last_numeric_type; -} - -constexpr auto set(type rhs) -> int { - return 1 << static_cast(rhs); -} -constexpr auto in(type t, int set) -> bool { - return ((set >> static_cast(t)) & 1) != 0; -} - -// Bitsets of types. -enum { - sint_set - = set(type::int_type) | set(type::long_long_type) | set(type::int128_type), - uint_set = set(type::uint_type) | set(type::ulong_long_type) - | set(type::uint128_type), - bool_set = set(type::bool_type), - char_set = set(type::char_type), - float_set = set(type::float_type) | set(type::double_type) - | set(type::long_double_type), - string_set = set(type::string_type), - cstring_set = set(type::cstring_type), - pointer_set = set(type::pointer_type) -}; - -// DEPRECATED! -FMT_NORETURN FMT_API void throw_format_error(const char *message); - -struct error_handler { - constexpr error_handler() = default; - - // This function is intentionally not constexpr to give a compile-time error. - FMT_NORETURN void on_error(const char *message) { - throw_format_error(message); - } -}; -} // namespace detail - -/** Throws ``format_error`` with a given message. */ -using detail::throw_format_error; - -/** String's character type. */ -template -using char_t = typename detail::char_t_impl::type; - -/** - \rst - Parsing context consisting of a format string range being parsed and an - argument counter for automatic indexing. - You can use the ``format_parse_context`` type alias for ``char`` instead. - \endrst - */ -FMT_EXPORT -template -class basic_format_parse_context { -private: - basic_string_view format_str_; - int next_arg_id_; - - FMT_CONSTEXPR void do_check_arg_id(int id); - -public: - using char_type = Char; - using iterator = const Char *; - - explicit constexpr basic_format_parse_context( - basic_string_view format_str, int next_arg_id = 0) - : format_str_(format_str), next_arg_id_(next_arg_id) {} - - /** - Returns an iterator to the beginning of the format string range being - parsed. - */ - constexpr auto begin() const noexcept -> iterator { - return format_str_.begin(); - } - - /** - Returns an iterator past the end of the format string range being parsed. - */ - constexpr auto end() const noexcept -> iterator { - return format_str_.end(); - } - - /** Advances the begin iterator to ``it``. */ - FMT_CONSTEXPR void advance_to(iterator it) { - format_str_.remove_prefix(detail::to_unsigned(it - begin())); - } - - /** - Reports an error if using the manual argument indexing; otherwise returns - the next argument index and switches to the automatic indexing. 
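The set()/in() helpers and the "bitsets of types" enum deleted above encode membership of small enum values as bits of an int, tested with a shift and a mask. A minimal standalone sketch of the same trick with an illustrative enum:

```cpp
#include <iostream>

enum class kind { none, integer, floating, string };

// One bit per enumerator.
constexpr int set(kind k) {
    return 1 << static_cast<int>(k);
}
constexpr bool in(kind k, int bitset) {
    return ((bitset >> static_cast<int>(k)) & 1) != 0;
}

// Membership sets are just ORed bits, computable at compile time.
constexpr int numeric_set = set(kind::integer) | set(kind::floating);

int main() {
    std::cout << std::boolalpha
              << in(kind::integer, numeric_set) << '\n' // true
              << in(kind::string, numeric_set) << '\n'; // false
}
```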
- */ - FMT_CONSTEXPR auto next_arg_id() -> int { - if (next_arg_id_ < 0) { - detail::throw_format_error( - "cannot switch from manual to automatic argument indexing"); - return 0; - } - int id = next_arg_id_++; - do_check_arg_id(id); - return id; - } - - /** - Reports an error if using the automatic argument indexing; otherwise - switches to the manual indexing. - */ - FMT_CONSTEXPR void check_arg_id(int id) { - if (next_arg_id_ > 0) { - detail::throw_format_error( - "cannot switch from automatic to manual argument indexing"); - return; - } - next_arg_id_ = -1; - do_check_arg_id(id); - } - FMT_CONSTEXPR void check_arg_id(basic_string_view) {} - FMT_CONSTEXPR void check_dynamic_spec(int arg_id); -}; - -FMT_EXPORT -using format_parse_context = basic_format_parse_context; - -namespace detail { -// A parse context with extra data used only in compile-time checks. -template -class compile_parse_context : public basic_format_parse_context { -private: - int num_args_; - const type *types_; - using base = basic_format_parse_context; - -public: - explicit FMT_CONSTEXPR compile_parse_context( - basic_string_view format_str, int num_args, const type *types, - int next_arg_id = 0) - : base(format_str, next_arg_id), num_args_(num_args), types_(types) {} - - constexpr auto num_args() const -> int { return num_args_; } - constexpr auto arg_type(int id) const -> type { return types_[id]; } - - FMT_CONSTEXPR auto next_arg_id() -> int { - int id = base::next_arg_id(); - if (id >= num_args_) throw_format_error("argument not found"); - return id; - } - - FMT_CONSTEXPR void check_arg_id(int id) { - base::check_arg_id(id); - if (id >= num_args_) throw_format_error("argument not found"); - } - using base::check_arg_id; - - FMT_CONSTEXPR void check_dynamic_spec(int arg_id) { - detail::ignore_unused(arg_id); -#if !defined(__LCC__) - if (arg_id < num_args_ && types_ && !is_integral_type(types_[arg_id])) - throw_format_error("width/precision is not integer"); -#endif - } -}; - -// Extracts a reference to the container from back_insert_iterator. -template -inline auto get_container(std::back_insert_iterator it) - -> Container & { - using base = std::back_insert_iterator; - struct accessor : base { - accessor(base b) : base(b) {} - using base::container; - }; - return *accessor(it).container; -} - -template -FMT_CONSTEXPR auto copy_str(InputIt begin, InputIt end, OutputIt out) - -> OutputIt { - while (begin != end) - *out++ = static_cast(*begin++); - return out; -} - -template , U>::value &&is_char::value)> -FMT_CONSTEXPR auto copy_str(T *begin, T *end, U *out) -> U * { - if (is_constant_evaluated()) - return copy_str(begin, end, out); - auto size = to_unsigned(end - begin); - if (size > 0) memcpy(out, begin, size * sizeof(U)); - return out + size; -} - -/** - \rst - A contiguous memory buffer with an optional growing ability. It is an internal - class and shouldn't be used directly, only via `~fmt::basic_memory_buffer`. - \endrst - */ -template -class buffer { -private: - T *ptr_; - size_t size_; - size_t capacity_; - -protected: - // Don't initialize ptr_ since it is not accessed to save a few cycles. - FMT_MSC_WARNING(suppress : 26495) - FMT_CONSTEXPR buffer(size_t sz) noexcept : size_(sz), capacity_(sz) {} - - FMT_CONSTEXPR20 buffer( - T *p = nullptr, size_t sz = 0, size_t cap = 0) noexcept - : ptr_(p), size_(sz), capacity_(cap) {} - - FMT_CONSTEXPR20 ~buffer() = default; - buffer(buffer &&) = default; - - /** Sets the buffer data and capacity. 
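The next_arg_id()/check_arg_id() pair above enforces that a format string uses either automatic ("{}") or manual ("{0}") argument indexing, never both. A simplified standalone model of that state machine (it throws std::logic_error where fmt reports a format_error; names are illustrative):

```cpp
#include <iostream>
#include <stdexcept>

// next_arg_id_ > 0 means automatic mode is engaged, < 0 means manual mode.
class parse_context {
public:
    int next_arg_id() { // called for "{}"
        if (next_arg_id_ < 0)
            throw std::logic_error(
                    "cannot switch from manual to automatic argument indexing");
        return next_arg_id_++;
    }
    void check_arg_id(int) { // called for "{0}", "{1}", ...
        if (next_arg_id_ > 0)
            throw std::logic_error(
                    "cannot switch from automatic to manual argument indexing");
        next_arg_id_ = -1;
    }

private:
    int next_arg_id_ = 0;
};

int main() {
    parse_context ctx;
    ctx.next_arg_id(); // "{}" -> id 0, automatic mode engaged
    try {
        ctx.check_arg_id(0); // a later "{0}" is an error
    } catch (const std::logic_error &e) {
        std::cout << e.what() << '\n';
    }
}
```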
*/ - FMT_CONSTEXPR void set(T *buf_data, size_t buf_capacity) noexcept { - ptr_ = buf_data; - capacity_ = buf_capacity; - } - - /** Increases the buffer capacity to hold at least *capacity* elements. */ - // DEPRECATED! - virtual FMT_CONSTEXPR20 void grow(size_t capacity) = 0; - -public: - using value_type = T; - using const_reference = const T &; - - buffer(const buffer &) = delete; - void operator=(const buffer &) = delete; - - FMT_INLINE auto begin() noexcept -> T * { return ptr_; } - FMT_INLINE auto end() noexcept -> T * { return ptr_ + size_; } - - FMT_INLINE auto begin() const noexcept -> const T * { return ptr_; } - FMT_INLINE auto end() const noexcept -> const T * { return ptr_ + size_; } - - /** Returns the size of this buffer. */ - constexpr auto size() const noexcept -> size_t { return size_; } - - /** Returns the capacity of this buffer. */ - constexpr auto capacity() const noexcept -> size_t { return capacity_; } - - /** Returns a pointer to the buffer data (not null-terminated). */ - FMT_CONSTEXPR auto data() noexcept -> T * { return ptr_; } - FMT_CONSTEXPR auto data() const noexcept -> const T * { return ptr_; } - - /** Clears this buffer. */ - void clear() { size_ = 0; } - - // Tries resizing the buffer to contain *count* elements. If T is a POD type - // the new elements may not be initialized. - FMT_CONSTEXPR20 void try_resize(size_t count) { - try_reserve(count); - size_ = count <= capacity_ ? count : capacity_; - } - - // Tries increasing the buffer capacity to *new_capacity*. It can increase the - // capacity by a smaller amount than requested but guarantees there is space - // for at least one additional element either by increasing the capacity or by - // flushing the buffer if it is full. - FMT_CONSTEXPR20 void try_reserve(size_t new_capacity) { - if (new_capacity > capacity_) grow(new_capacity); - } - - FMT_CONSTEXPR20 void push_back(const T &value) { - try_reserve(size_ + 1); - ptr_[size_++] = value; - } - - /** Appends data to the end of the buffer. */ - template - void append(const U *begin, const U *end); - - template - FMT_CONSTEXPR auto operator[](Idx index) -> T & { - return ptr_[index]; - } - template - FMT_CONSTEXPR auto operator[](Idx index) const -> const T & { - return ptr_[index]; - } -}; - -struct buffer_traits { - explicit buffer_traits(size_t) {} - auto count() const -> size_t { return 0; } - auto limit(size_t size) -> size_t { return size; } -}; - -class fixed_buffer_traits { -private: - size_t count_ = 0; - size_t limit_; - -public: - explicit fixed_buffer_traits(size_t limit) : limit_(limit) {} - auto count() const -> size_t { return count_; } - auto limit(size_t size) -> size_t { - size_t n = limit_ > count_ ? limit_ - count_ : 0; - count_ += size; - return size < n ? size : n; - } -}; - -// A buffer that writes to an output iterator when flushed. 
-template -class iterator_buffer final : public Traits, public buffer { -private: - OutputIt out_; - enum { buffer_size = 256 }; - T data_[buffer_size]; - -protected: - FMT_CONSTEXPR20 void grow(size_t) override { - if (this->size() == buffer_size) flush(); - } - - void flush() { - auto size = this->size(); - this->clear(); - out_ = copy_str(data_, data_ + this->limit(size), out_); - } - -public: - explicit iterator_buffer(OutputIt out, size_t n = buffer_size) - : Traits(n), buffer(data_, 0, buffer_size), out_(out) {} - iterator_buffer(iterator_buffer &&other) - : Traits(other), buffer(data_, 0, buffer_size), out_(other.out_) {} - ~iterator_buffer() { flush(); } - - auto out() -> OutputIt { - flush(); - return out_; - } - auto count() const -> size_t { return Traits::count() + this->size(); } -}; - -template -class iterator_buffer final - : public fixed_buffer_traits, - public buffer { -private: - T *out_; - enum { buffer_size = 256 }; - T data_[buffer_size]; - -protected: - FMT_CONSTEXPR20 void grow(size_t) override { - if (this->size() == this->capacity()) flush(); - } - - void flush() { - size_t n = this->limit(this->size()); - if (this->data() == out_) { - out_ += n; - this->set(data_, buffer_size); - } - this->clear(); - } - -public: - explicit iterator_buffer(T *out, size_t n = buffer_size) - : fixed_buffer_traits(n), buffer(out, 0, n), out_(out) {} - iterator_buffer(iterator_buffer &&other) - : fixed_buffer_traits(other) - , buffer(std::move(other)) - , out_(other.out_) { - if (this->data() != out_) { - this->set(data_, buffer_size); - this->clear(); - } - } - ~iterator_buffer() { flush(); } - - auto out() -> T * { - flush(); - return out_; - } - auto count() const -> size_t { - return fixed_buffer_traits::count() + this->size(); - } -}; - -template -class iterator_buffer final : public buffer { -protected: - FMT_CONSTEXPR20 void grow(size_t) override {} - -public: - explicit iterator_buffer(T *out, size_t = 0) - : buffer(out, 0, ~size_t()) {} - - auto out() -> T * { return &*this->end(); } -}; - -// A buffer that writes to a container with the contiguous storage. -template -class iterator_buffer, - enable_if_t::value, - typename Container::value_type>> - final : public buffer { -private: - Container &container_; - -protected: - FMT_CONSTEXPR20 void grow(size_t capacity) override { - container_.resize(capacity); - this->set(&container_[0], capacity); - } - -public: - explicit iterator_buffer(Container &c) - : buffer(c.size()), container_(c) {} - explicit iterator_buffer( - std::back_insert_iterator out, size_t = 0) - : iterator_buffer(get_container(out)) {} - - auto out() -> std::back_insert_iterator { - return std::back_inserter(container_); - } -}; - -// A buffer that counts the number of code units written discarding the output. -template -class counting_buffer final : public buffer { -private: - enum { buffer_size = 256 }; - T data_[buffer_size]; - size_t count_ = 0; - -protected: - FMT_CONSTEXPR20 void grow(size_t) override { - if (this->size() != buffer_size) return; - count_ += this->size(); - this->clear(); - } - -public: - counting_buffer() : buffer(data_, 0, buffer_size) {} - - auto count() -> size_t { return count_ + this->size(); } -}; -} // namespace detail - -template -FMT_CONSTEXPR void basic_format_parse_context::do_check_arg_id(int id) { - // Argument id is only checked at compile-time during parsing because - // formatting has its own validation. 
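The container specialization of `iterator_buffer` above is why writing through `std::back_inserter` into contiguous storage avoids the 256-element staging array: the buffer grows the container and formats in place. Illustrative usage:

```cpp
#include <fmt/core.h>
#include <iterator>
#include <string>

int main() {
  std::string s;
  // Routed to the contiguous-container iterator_buffer: the string is
  // resized and written into directly, with no intermediate copy.
  fmt::format_to(std::back_inserter(s), "{}-{}", 1, 2);
  // s == "1-2"
}
```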
- if (detail::is_constant_evaluated() - && (!FMT_GCC_VERSION || FMT_GCC_VERSION >= 1200)) { - using context = detail::compile_parse_context; - if (id >= static_cast(this)->num_args()) - detail::throw_format_error("argument not found"); - } -} - -template -FMT_CONSTEXPR void basic_format_parse_context::check_dynamic_spec( - int arg_id) { - if (detail::is_constant_evaluated() - && (!FMT_GCC_VERSION || FMT_GCC_VERSION >= 1200)) { - using context = detail::compile_parse_context; - static_cast(this)->check_dynamic_spec(arg_id); - } -} - -FMT_EXPORT template -class basic_format_arg; -FMT_EXPORT template -class basic_format_args; -FMT_EXPORT template -class dynamic_format_arg_store; - -// A formatter for objects of type T. -FMT_EXPORT -template -struct formatter { - // A deleted default constructor indicates a disabled formatter. - formatter() = delete; -}; - -// Specifies if T has an enabled formatter specialization. A type can be -// formattable even if it doesn't have a formatter e.g. via a conversion. -template -using has_formatter - = std::is_constructible>; - -// An output iterator that appends to a buffer. -// It is used to reduce symbol sizes for the common case. -class appender : public std::back_insert_iterator> { - using base = std::back_insert_iterator>; - -public: - using std::back_insert_iterator>::back_insert_iterator; - appender(base it) noexcept : base(it) {} - FMT_UNCHECKED_ITERATOR(appender); - - auto operator++() noexcept -> appender & { return *this; } - auto operator++(int) noexcept -> appender { return *this; } -}; - -namespace detail { - -template -constexpr auto has_const_formatter_impl(T *) -> decltype( - typename Context::template formatter_type().format( - std::declval(), std::declval()), - true) { - return true; -} -template -constexpr auto has_const_formatter_impl(...) -> bool { - return false; -} -template -constexpr auto has_const_formatter() -> bool { - return has_const_formatter_impl(static_cast(nullptr)); -} - -template -using buffer_appender = conditional_t::value, appender, - std::back_insert_iterator>>; - -// Maps an output iterator to a buffer. -template -auto get_buffer(OutputIt out) -> iterator_buffer { - return iterator_buffer(out); -} -template , Buf>::value)> -auto get_buffer(std::back_insert_iterator out) -> buffer & { - return get_container(out); -} - -template -FMT_INLINE auto get_iterator(Buf &buf, OutputIt) -> decltype(buf.out()) { - return buf.out(); -} -template -auto get_iterator(buffer &, OutputIt out) -> OutputIt { - return out; -} - -struct view {}; - -template -struct named_arg : view { - const Char *name; - const T &value; - named_arg(const Char *n, const T &v) : name(n), value(v) {} -}; - -template -struct named_arg_info { - const Char *name; - int id; -}; - -template -struct arg_data { - // args_[0].named_args points to named_args_ to avoid bloating format_args. - // +1 to workaround a bug in gcc 7.5 that causes duplicated-branches warning. - T args_[1 + (NUM_ARGS != 0 ? NUM_ARGS : +1)]; - named_arg_info named_args_[NUM_NAMED_ARGS]; - - template - arg_data(const U &...init) - : args_ {T(named_args_, NUM_NAMED_ARGS), init...} {} - arg_data(const arg_data &other) = delete; - auto args() const -> const T * { return args_ + 1; } - auto named_args() -> named_arg_info * { return named_args_; } -}; - -template -struct arg_data { - // +1 to workaround a bug in gcc 7.5 that causes duplicated-branches warning. - T args_[NUM_ARGS != 0 ? 
NUM_ARGS : +1]; - - template - FMT_CONSTEXPR FMT_INLINE arg_data(const U &...init) : args_ {init...} {} - FMT_CONSTEXPR FMT_INLINE auto args() const -> const T * { return args_; } - FMT_CONSTEXPR FMT_INLINE auto named_args() -> std::nullptr_t { - return nullptr; - } -}; - -template -inline void init_named_args(named_arg_info *, int, int) {} - -template -struct is_named_arg : std::false_type {}; -template -struct is_statically_named_arg : std::false_type {}; - -template -struct is_named_arg> : std::true_type {}; - -template ::value)> -void init_named_args(named_arg_info *named_args, int arg_count, - int named_arg_count, const T &, const Tail &...args) { - init_named_args(named_args, arg_count + 1, named_arg_count, args...); -} - -template ::value)> -void init_named_args(named_arg_info *named_args, int arg_count, - int named_arg_count, const T &arg, const Tail &...args) { - named_args[named_arg_count++] = {arg.name, arg_count}; - init_named_args(named_args, arg_count + 1, named_arg_count, args...); -} - -template -FMT_CONSTEXPR FMT_INLINE void init_named_args( - std::nullptr_t, int, int, const Args &...) {} - -template -constexpr auto count() -> size_t { - return B ? 1 : 0; -} -template -constexpr auto count() -> size_t { - return (B1 ? 1 : 0) + count(); -} - -template -constexpr auto count_named_args() -> size_t { - return count::value...>(); -} - -template -constexpr auto count_statically_named_args() -> size_t { - return count::value...>(); -} - -struct unformattable {}; -struct unformattable_char : unformattable {}; -struct unformattable_pointer : unformattable {}; - -template -struct string_value { - const Char *data; - size_t size; -}; - -template -struct named_arg_value { - const named_arg_info *data; - size_t size; -}; - -template -struct custom_value { - using parse_context = typename Context::parse_context_type; - void *value; - void (*format)(void *arg, parse_context &parse_ctx, Context &ctx); -}; - -// A formatting argument value. 
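The `init_named_args`/`count_named_args` machinery above records each named argument's name and position so `{name}` fields can be resolved by name at format time. Usage through the public API:

```cpp
#include <fmt/core.h>

int main() {
  // Each fmt::arg(...) becomes a named_arg that init_named_args indexes.
  fmt::print("{name} is {age}\n",
             fmt::arg("name", "Ada"), fmt::arg("age", 36));
  // prints "Ada is 36"
}
```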
-template -class value { -public: - using char_type = typename Context::char_type; - - union { - monostate no_value; - int int_value; - unsigned uint_value; - long long long_long_value; - unsigned long long ulong_long_value; - int128_opt int128_value; - uint128_opt uint128_value; - bool bool_value; - char_type char_value; - float float_value; - double double_value; - long double long_double_value; - const void *pointer; - string_value string; - custom_value custom; - named_arg_value named_args; - }; - - constexpr FMT_INLINE value() : no_value() {} - constexpr FMT_INLINE value(int val) : int_value(val) {} - constexpr FMT_INLINE value(unsigned val) : uint_value(val) {} - constexpr FMT_INLINE value(long long val) : long_long_value(val) {} - constexpr FMT_INLINE value(unsigned long long val) - : ulong_long_value(val) {} - FMT_INLINE value(int128_opt val) : int128_value(val) {} - FMT_INLINE value(uint128_opt val) : uint128_value(val) {} - constexpr FMT_INLINE value(float val) : float_value(val) {} - constexpr FMT_INLINE value(double val) : double_value(val) {} - FMT_INLINE value(long double val) : long_double_value(val) {} - constexpr FMT_INLINE value(bool val) : bool_value(val) {} - constexpr FMT_INLINE value(char_type val) : char_value(val) {} - FMT_CONSTEXPR FMT_INLINE value(const char_type *val) { - string.data = val; - if (is_constant_evaluated()) string.size = {}; - } - FMT_CONSTEXPR FMT_INLINE value(basic_string_view val) { - string.data = val.data(); - string.size = val.size(); - } - FMT_INLINE value(const void *val) : pointer(val) {} - FMT_INLINE value(const named_arg_info *args, size_t size) - : named_args {args, size} {} - - template - FMT_CONSTEXPR20 FMT_INLINE value(T &val) { - using value_type = remove_const_t; - custom.value = const_cast(std::addressof(val)); - // Get the formatter type through the context to allow different contexts - // have different extension points, e.g. `formatter` for `format` and - // `printf_formatter` for `printf`. - custom.format = format_custom_arg>; - } - value(unformattable); - value(unformattable_char); - value(unformattable_pointer); - -private: - // Formats an argument of a custom type, such as a user-defined class. - template - static void format_custom_arg(void *arg, - typename Context::parse_context_type &parse_ctx, Context &ctx) { - auto f = Formatter(); - parse_ctx.advance_to(f.parse(parse_ctx)); - using qualified_type - = conditional_t(), const T, T>; - // Calling format through a mutable reference is deprecated. - ctx.advance_to(f.format(*static_cast(arg), ctx)); - } -}; - -// To minimize the number of types we need to deal with, long is translated -// either to int or to long long depending on its size. -enum { long_short = sizeof(long) == sizeof(int) }; -using long_type = conditional_t; -using ulong_type = conditional_t; - -template -struct format_as_result { - template ::value || std::is_class::value)> - static auto map(U *) - -> remove_cvref_t()))>; - static auto map(...) -> void; - - using type = decltype(map(static_cast(nullptr))); -}; -template -using format_as_t = typename format_as_result::type; - -template -struct has_format_as - : bool_constant, void>::value> {}; - -// Maps formatting arguments to core types. -// arg_mapper reports errors by returning unformattable instead of using -// static_assert because it's used in the is_formattable trait. 
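Two user-facing extension points meet in the code above: `format_as` (consumed by the `format_as_result` mapping) and `formatter` specializations (invoked through `value::format_custom_arg`, which calls `parse` and then `format`). A minimal sketch of both:

```cpp
#include <fmt/core.h>

// format_as: maps a type to an already-formattable one, found via ADL.
enum class level { info, warn };
auto format_as(level l) -> int { return static_cast<int>(l); }

// formatter specialization: the custom_type path above.
struct point { double x, y; };
template <> struct fmt::formatter<point> {
  // Accepts only empty specs in this sketch.
  constexpr auto parse(format_parse_context& ctx) { return ctx.begin(); }
  auto format(const point& p, format_context& ctx) const {
    return fmt::format_to(ctx.out(), "({}, {})", p.x, p.y);
  }
};

int main() {
  fmt::print("{} {}\n", level::warn, point{1, 2});  // "1 (1, 2)"
}
```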
-template -struct arg_mapper { - using char_type = typename Context::char_type; - - FMT_CONSTEXPR FMT_INLINE auto map(signed char val) -> int { return val; } - FMT_CONSTEXPR FMT_INLINE auto map(unsigned char val) -> unsigned { - return val; - } - FMT_CONSTEXPR FMT_INLINE auto map(short val) -> int { return val; } - FMT_CONSTEXPR FMT_INLINE auto map(unsigned short val) -> unsigned { - return val; - } - FMT_CONSTEXPR FMT_INLINE auto map(int val) -> int { return val; } - FMT_CONSTEXPR FMT_INLINE auto map(unsigned val) -> unsigned { return val; } - FMT_CONSTEXPR FMT_INLINE auto map(long val) -> long_type { return val; } - FMT_CONSTEXPR FMT_INLINE auto map(unsigned long val) -> ulong_type { - return val; - } - FMT_CONSTEXPR FMT_INLINE auto map(long long val) -> long long { - return val; - } - FMT_CONSTEXPR FMT_INLINE auto map(unsigned long long val) - -> unsigned long long { - return val; - } - FMT_CONSTEXPR FMT_INLINE auto map(int128_opt val) -> int128_opt { - return val; - } - FMT_CONSTEXPR FMT_INLINE auto map(uint128_opt val) -> uint128_opt { - return val; - } - FMT_CONSTEXPR FMT_INLINE auto map(bool val) -> bool { return val; } - - template ::value - || std::is_same::value)> - FMT_CONSTEXPR FMT_INLINE auto map(T val) -> char_type { - return val; - } - template ::value || -#ifdef __cpp_char8_t - std::is_same::value || -#endif - std::is_same::value - || std::is_same::value) - && !std::is_same::value, - int> = 0> - FMT_CONSTEXPR FMT_INLINE auto map(T) -> unformattable_char { - return {}; - } - - FMT_CONSTEXPR FMT_INLINE auto map(float val) -> float { return val; } - FMT_CONSTEXPR FMT_INLINE auto map(double val) -> double { return val; } - FMT_CONSTEXPR FMT_INLINE auto map(long double val) -> long double { - return val; - } - - FMT_CONSTEXPR FMT_INLINE auto map(char_type *val) -> const char_type * { - return val; - } - FMT_CONSTEXPR FMT_INLINE auto map(const char_type *val) - -> const char_type * { - return val; - } - template ::value && !std::is_pointer::value - && std::is_same>::value)> - FMT_CONSTEXPR FMT_INLINE auto map(const T &val) - -> basic_string_view { - return to_string_view(val); - } - template ::value && !std::is_pointer::value - && !std::is_same>::value)> - FMT_CONSTEXPR FMT_INLINE auto map(const T &) -> unformattable_char { - return {}; - } - - FMT_CONSTEXPR FMT_INLINE auto map(void *val) -> const void * { return val; } - FMT_CONSTEXPR FMT_INLINE auto map(const void *val) -> const void * { - return val; - } - FMT_CONSTEXPR FMT_INLINE auto map(std::nullptr_t val) -> const void * { - return val; - } - - // Use SFINAE instead of a const T* parameter to avoid a conflict with the - // array overload. - template ::value - || std::is_member_pointer::value - || std::is_function< - typename std::remove_pointer::type>::value - || (std::is_array::value - && !std::is_convertible::value))> - FMT_CONSTEXPR auto map(const T &) -> unformattable_pointer { - return {}; - } - - template ::value)> - FMT_CONSTEXPR FMT_INLINE auto map(const T (&values)[N]) -> const T (&)[N] { - return values; - } - - // Only map owning types because mapping views can be unsafe. 
- template , - FMT_ENABLE_IF(std::is_arithmetic::value)> - FMT_CONSTEXPR FMT_INLINE auto map(const T &val) - -> decltype(FMT_DECLTYPE_THIS map(U())) { - return map(format_as(val)); - } - - template > - struct formattable : bool_constant() - || (has_formatter::value - && !std::is_const::value)> {}; - - template ::value)> - FMT_CONSTEXPR FMT_INLINE auto do_map(T &val) -> T & { - return val; - } - template ::value)> - FMT_CONSTEXPR FMT_INLINE auto do_map(T &) -> unformattable { - return {}; - } - - template , - FMT_ENABLE_IF((std::is_class::value || std::is_enum::value - || std::is_union::value) - && !is_string::value && !is_char::value - && !is_named_arg::value - && !std::is_arithmetic>::value)> - FMT_CONSTEXPR FMT_INLINE auto map(T &val) - -> decltype(FMT_DECLTYPE_THIS do_map(val)) { - return do_map(val); - } - - template ::value)> - FMT_CONSTEXPR FMT_INLINE auto map(const T &named_arg) - -> decltype(FMT_DECLTYPE_THIS map(named_arg.value)) { - return map(named_arg.value); - } - - auto map(...) -> unformattable { return {}; } -}; - -// A type constant after applying arg_mapper. -template -using mapped_type_constant = type_constant().map( - std::declval())), - typename Context::char_type>; - -enum { packed_arg_bits = 4 }; -// Maximum number of arguments with packed types. -enum { max_packed_args = 62 / packed_arg_bits }; -enum : unsigned long long { is_unpacked_bit = 1ULL << 63 }; -enum : unsigned long long { has_named_args_bit = 1ULL << 62 }; - -template -auto copy_str(InputIt begin, InputIt end, appender out) -> appender { - get_container(out).append(begin, end); - return out; -} -template -auto copy_str( - InputIt begin, InputIt end, std::back_insert_iterator out) - -> std::back_insert_iterator { - get_container(out).append(begin, end); - return out; -} - -template -FMT_CONSTEXPR auto copy_str(R &&rng, OutputIt out) -> OutputIt { - return detail::copy_str(rng.begin(), rng.end(), out); -} - -#if FMT_GCC_VERSION && FMT_GCC_VERSION < 500 -// A workaround for gcc 4.8 to make void_t work in a SFINAE context. -template -struct void_t_impl { - using type = void; -}; -template -using void_t = typename void_t_impl::type; -#else -template -using void_t = void; -#endif - -template -struct is_output_iterator : std::false_type {}; - -template -struct is_output_iterator::iterator_category, - decltype(*std::declval() = std::declval())>> - : std::true_type {}; - -template -struct is_back_insert_iterator : std::false_type {}; -template -struct is_back_insert_iterator> - : std::true_type {}; - -// A type-erased reference to an std::locale to avoid a heavy include. -class locale_ref { -private: - const void *locale_; // A type-erased pointer to std::locale. 
- -public: - constexpr FMT_INLINE locale_ref() : locale_(nullptr) {} - template - explicit locale_ref(const Locale &loc); - - explicit operator bool() const noexcept { return locale_ != nullptr; } - - template - auto get() const -> Locale; -}; - -template -constexpr auto encode_types() -> unsigned long long { - return 0; -} - -template -constexpr auto encode_types() -> unsigned long long { - return static_cast(mapped_type_constant::value) - | (encode_types() << packed_arg_bits); -} - -#if defined(__cpp_if_constexpr) -// This type is intentionally undefined, only used for errors -template -struct type_is_unformattable_for; -#endif - -template -FMT_CONSTEXPR FMT_INLINE auto make_arg(T &val) -> value { - using arg_type = remove_cvref_t().map(val))>; - - constexpr bool formattable_char - = !std::is_same::value; - static_assert(formattable_char, "Mixing character types is disallowed."); - - // Formatting of arbitrary pointers is disallowed. If you want to format a - // pointer cast it to `void*` or `const void*`. In particular, this forbids - // formatting of `[const] volatile char*` printed as bool by iostreams. - constexpr bool formattable_pointer - = !std::is_same::value; - static_assert(formattable_pointer, - "Formatting of non-void pointers is disallowed."); - - constexpr bool formattable = !std::is_same::value; -#if defined(__cpp_if_constexpr) - if constexpr (!formattable) { - type_is_unformattable_for _; - } -#endif - static_assert(formattable, - "Cannot format an argument. To make type T formattable provide a " - "formatter specialization: https://fmt.dev/latest/api.html#udt"); - return {arg_mapper().map(val)}; -} - -template -FMT_CONSTEXPR auto make_arg(T &val) -> basic_format_arg { - auto arg = basic_format_arg(); - arg.type_ = mapped_type_constant::value; - arg.value_ = make_arg(val); - return arg; -} - -template -FMT_CONSTEXPR inline auto make_arg(T &val) -> basic_format_arg { - return make_arg(val); -} -} // namespace detail -FMT_BEGIN_EXPORT - -// A formatting argument. Context is a template parameter for the compiled API -// where output can be unbuffered. 
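The static_asserts in `make_arg` above are the source of two well-known fmt diagnostics: character types cannot be mixed, and only `void` pointers are formattable. The documented workaround for the pointer case:

```cpp
#include <fmt/format.h>  // for fmt::ptr

int main() {
  int x = 42;
  int* p = &x;
  // fmt::print("{}", p);  // rejected: "Formatting of non-void pointers ..."
  fmt::print("{}\n", static_cast<const void*>(p));  // OK: prints the address
  fmt::print("{}\n", fmt::ptr(p));                  // equivalent helper
}
```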
-template -class basic_format_arg { -private: - detail::value value_; - detail::type type_; - - template - friend FMT_CONSTEXPR auto detail::make_arg(T &value) - -> basic_format_arg; - - template - friend FMT_CONSTEXPR auto visit_format_arg(Visitor &&vis, - const basic_format_arg &arg) -> decltype(vis(0)); - - friend class basic_format_args; - friend class dynamic_format_arg_store; - - using char_type = typename Context::char_type; - - template - friend struct detail::arg_data; - - basic_format_arg(const detail::named_arg_info *args, size_t size) - : value_(args, size) {} - -public: - class handle { - public: - explicit handle(detail::custom_value custom) - : custom_(custom) {} - - void format(typename Context::parse_context_type &parse_ctx, - Context &ctx) const { - custom_.format(custom_.value, parse_ctx, ctx); - } - - private: - detail::custom_value custom_; - }; - - constexpr basic_format_arg() : type_(detail::type::none_type) {} - - constexpr explicit operator bool() const noexcept { - return type_ != detail::type::none_type; - } - - auto type() const -> detail::type { return type_; } - - auto is_integral() const -> bool { return detail::is_integral_type(type_); } - auto is_arithmetic() const -> bool { - return detail::is_arithmetic_type(type_); - } - - FMT_INLINE auto format_custom(const char_type *parse_begin, - typename Context::parse_context_type &parse_ctx, Context &ctx) - -> bool { - if (type_ != detail::type::custom_type) return false; - parse_ctx.advance_to(parse_begin); - value_.custom.format(value_.custom.value, parse_ctx, ctx); - return true; - } -}; - -/** - \rst - Visits an argument dispatching to the appropriate visit method based on - the argument type. For example, if the argument type is ``double`` then - ``vis(value)`` will be called with the value of type ``double``. - \endrst - */ -// DEPRECATED! -template -FMT_CONSTEXPR FMT_INLINE auto visit_format_arg(Visitor &&vis, - const basic_format_arg &arg) -> decltype(vis(0)) { - switch (arg.type_) { - case detail::type::none_type: break; - case detail::type::int_type: return vis(arg.value_.int_value); - case detail::type::uint_type: return vis(arg.value_.uint_value); - case detail::type::long_long_type: - return vis(arg.value_.long_long_value); - case detail::type::ulong_long_type: - return vis(arg.value_.ulong_long_value); - case detail::type::int128_type: - return vis(detail::convert_for_visit(arg.value_.int128_value)); - case detail::type::uint128_type: - return vis(detail::convert_for_visit(arg.value_.uint128_value)); - case detail::type::bool_type: return vis(arg.value_.bool_value); - case detail::type::char_type: return vis(arg.value_.char_value); - case detail::type::float_type: return vis(arg.value_.float_value); - case detail::type::double_type: return vis(arg.value_.double_value); - case detail::type::long_double_type: - return vis(arg.value_.long_double_value); - case detail::type::cstring_type: return vis(arg.value_.string.data); - case detail::type::string_type: - using sv = basic_string_view; - return vis(sv(arg.value_.string.data, arg.value_.string.size)); - case detail::type::pointer_type: return vis(arg.value_.pointer); - case detail::type::custom_type: - return vis(typename basic_format_arg::handle( - arg.value_.custom)); - } - return vis(monostate()); -} - -// Formatting context. 
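The (deprecated, per the comment above) `visit_format_arg` dispatches on the erased `type_` tag and hands the visitor the concrete value. A sketch of inspecting an argument store with it; note that in this fmt version `make_format_args` takes lvalue references, so the arguments must be named variables:

```cpp
#include <fmt/core.h>

int main() {
  int x = 42;
  auto store = fmt::make_format_args(x);
  fmt::format_args args = store;
  fmt::visit_format_arg(
      [](auto value) {
        (void)value;  // concrete type here: int; monostate if absent;
                      // a basic_format_arg<...>::handle for custom types
      },
      args.get(0));
}
```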
-template -class basic_format_context { -private: - OutputIt out_; - basic_format_args args_; - detail::locale_ref loc_; - -public: - using iterator = OutputIt; - using format_arg = basic_format_arg; - using format_args = basic_format_args; - using parse_context_type = basic_format_parse_context; - template - using formatter_type = formatter; - - /** The character type for the output. */ - using char_type = Char; - - basic_format_context(basic_format_context &&) = default; - basic_format_context(const basic_format_context &) = delete; - void operator=(const basic_format_context &) = delete; - /** - Constructs a ``basic_format_context`` object. References to the arguments - are stored in the object so make sure they have appropriate lifetimes. - */ - constexpr basic_format_context( - OutputIt out, format_args ctx_args, detail::locale_ref loc = {}) - : out_(out), args_(ctx_args), loc_(loc) {} - - constexpr auto arg(int id) const -> format_arg { return args_.get(id); } - FMT_CONSTEXPR auto arg(basic_string_view name) -> format_arg { - return args_.get(name); - } - FMT_CONSTEXPR auto arg_id(basic_string_view name) -> int { - return args_.get_id(name); - } - auto args() const -> const format_args & { return args_; } - - // DEPRECATED! - FMT_CONSTEXPR auto error_handler() -> detail::error_handler { return {}; } - void on_error(const char *message) { error_handler().on_error(message); } - - // Returns an iterator to the beginning of the output range. - FMT_CONSTEXPR auto out() -> iterator { return out_; } - - // Advances the begin iterator to ``it``. - void advance_to(iterator it) { - if (!detail::is_back_insert_iterator()) out_ = it; - } - - FMT_CONSTEXPR auto locale() -> detail::locale_ref { return loc_; } -}; - -template -using buffer_context - = basic_format_context, Char>; -using format_context = buffer_context; - -template -using is_formattable = bool_constant>().map( - std::declval()))>::value>; - -/** - \rst - An array of references to arguments. It can be implicitly converted into - `~fmt::basic_format_args` for passing into type-erased formatting functions - such as `~fmt::vformat`. - \endrst - */ -template -class format_arg_store -#if FMT_GCC_VERSION && FMT_GCC_VERSION < 409 - // Workaround a GCC template argument substitution bug. - : public basic_format_args -#endif -{ -private: - static const size_t num_args = sizeof...(Args); - static constexpr size_t num_named_args - = detail::count_named_args(); - static const bool is_packed = num_args <= detail::max_packed_args; - - using value_type = conditional_t, - basic_format_arg>; - - detail::arg_data - data_; - - friend class basic_format_args; - - static constexpr unsigned long long desc - = (is_packed ? detail::encode_types() - : detail::is_unpacked_bit | num_args) - | (num_named_args != 0 ? static_cast( - detail::has_named_args_bit) - : 0); - -public: - template - FMT_CONSTEXPR FMT_INLINE format_arg_store(T &...args) - : -#if FMT_GCC_VERSION && FMT_GCC_VERSION < 409 - basic_format_args(*this) - , -#endif - data_ {detail::make_arg(args)...} { - if (detail::const_check(num_named_args != 0)) - detail::init_named_args(data_.named_args(), 0, 0, args...); - } -}; - -/** - \rst - Constructs a `~fmt::format_arg_store` object that contains references to - arguments and can be implicitly converted to `~fmt::format_args`. `Context` - can be omitted in which case it defaults to `~fmt::format_context`. - See `~fmt::arg` for lifetime considerations. - \endrst - */ -// Arguments are taken by lvalue references to avoid some lifetime issues. 
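`is_formattable` above is the public trait behind "can this type be formatted", built by probing `arg_mapper`. It is handy for static_asserts in generic code:

```cpp
#include <fmt/core.h>

struct opaque {};  // no formatter specialization, no format_as

static_assert(fmt::is_formattable<int>::value, "built-in types map directly");
static_assert(!fmt::is_formattable<opaque>::value,
              "arg_mapper returns unformattable, so the trait is false");

int main() {}
```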
-template -constexpr auto make_format_args(T &...args) - -> format_arg_store...> { - return {args...}; -} - -/** - \rst - Returns a named argument to be used in a formatting function. - It should only be used in a call to a formatting function or - `dynamic_format_arg_store::push_back`. - - **Example**:: - - fmt::print("Elapsed time: {s:.2f} seconds", fmt::arg("s", 1.23)); - \endrst - */ -template -inline auto arg(const Char *name, const T &arg) -> detail::named_arg { - static_assert(!detail::is_named_arg(), "nested named arguments"); - return {name, arg}; -} -FMT_END_EXPORT - -/** - \rst - A view of a collection of formatting arguments. To avoid lifetime issues it - should only be used as a parameter type in type-erased functions such as - ``vformat``:: - - void vlog(string_view format_str, format_args args); // OK - format_args args = make_format_args(); // Error: dangling reference - \endrst - */ -template -class basic_format_args { -public: - using size_type = int; - using format_arg = basic_format_arg; - -private: - // A descriptor that contains information about formatting arguments. - // If the number of arguments is less or equal to max_packed_args then - // argument types are passed in the descriptor. This reduces binary code size - // per formatting function call. - unsigned long long desc_; - union { - // If is_packed() returns true then argument values are stored in values_; - // otherwise they are stored in args_. This is done to improve cache - // locality and reduce compiled code size since storing larger objects - // may require more code (at least on x86-64) even if the same amount of - // data is actually copied to stack. It saves ~10% on the bloat test. - const detail::value *values_; - const format_arg *args_; - }; - - constexpr auto is_packed() const -> bool { - return (desc_ & detail::is_unpacked_bit) == 0; - } - auto has_named_args() const -> bool { - return (desc_ & detail::has_named_args_bit) != 0; - } - - FMT_CONSTEXPR auto type(int index) const -> detail::type { - int shift = index * detail::packed_arg_bits; - unsigned int mask = (1 << detail::packed_arg_bits) - 1; - return static_cast((desc_ >> shift) & mask); - } - - constexpr FMT_INLINE basic_format_args( - unsigned long long desc, const detail::value *values) - : desc_(desc), values_(values) {} - constexpr basic_format_args(unsigned long long desc, const format_arg *args) - : desc_(desc), args_(args) {} - -public: - constexpr basic_format_args() : desc_(0), args_(nullptr) {} - - /** - \rst - Constructs a `basic_format_args` object from `~fmt::format_arg_store`. - \endrst - */ - template - constexpr FMT_INLINE basic_format_args( - const format_arg_store &store) - : basic_format_args( - format_arg_store::desc, store.data_.args()) {} - - /** - \rst - Constructs a `basic_format_args` object from - `~fmt::dynamic_format_arg_store`. - \endrst - */ - constexpr FMT_INLINE basic_format_args( - const dynamic_format_arg_store &store) - : basic_format_args(store.get_types(), store.data()) {} - - /** - \rst - Constructs a `basic_format_args` object from a dynamic set of arguments. - \endrst - */ - constexpr basic_format_args(const format_arg *args, int count) - : basic_format_args( - detail::is_unpacked_bit | detail::to_unsigned(count), args) {} - - /** Returns the argument with the specified id. 
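The `vlog` pattern mentioned in the comment above is worth spelling out: `format_args` is a non-owning view, so it must be consumed in the same full expression that built it, typically by pairing a type-erased `v...` function with a thin templated wrapper:

```cpp
#include <fmt/core.h>
#include <cstdio>

// Type-erased core: one instantiation regardless of argument types. This is
// safe because the view is consumed before the call returns.
void vlog(fmt::string_view fmt, fmt::format_args args) {
  std::fputs(fmt::vformat(fmt, args).c_str(), stdout);
}

// Templated shim: builds the argument store at the call site.
template <typename... T>
void log(fmt::format_string<T...> fmt, T&&... args) {
  vlog(fmt, fmt::make_format_args(args...));
}

int main() { log("{} + {} = {}\n", 1, 2, 3); }
```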
*/ - FMT_CONSTEXPR auto get(int id) const -> format_arg { - format_arg arg; - if (!is_packed()) { - if (id < max_size()) arg = args_[id]; - return arg; - } - if (id >= detail::max_packed_args) return arg; - arg.type_ = type(id); - if (arg.type_ == detail::type::none_type) return arg; - arg.value_ = values_[id]; - return arg; - } - - template - auto get(basic_string_view name) const -> format_arg { - int id = get_id(name); - return id >= 0 ? get(id) : format_arg(); - } - - template - auto get_id(basic_string_view name) const -> int { - if (!has_named_args()) return -1; - const auto &named_args - = (is_packed() ? values_[-1] : args_[-1].value_).named_args; - for (size_t i = 0; i < named_args.size; ++i) { - if (named_args.data[i].name == name) return named_args.data[i].id; - } - return -1; - } - - auto max_size() const -> int { - unsigned long long max_packed = detail::max_packed_args; - return static_cast( - is_packed() ? max_packed : desc_ & ~detail::is_unpacked_bit); - } -}; - -/** An alias to ``basic_format_args``. */ -// A separate type would result in shorter symbols but break ABI compatibility -// between clang and gcc on ARM (#1919). -FMT_EXPORT using format_args = basic_format_args; - -// We cannot use enum classes as bit fields because of a gcc bug, so we put them -// in namespaces instead (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=61414). -// Additionally, if an underlying type is specified, older gcc incorrectly warns -// that the type is too small. Both bugs are fixed in gcc 9.3. -#if FMT_GCC_VERSION && FMT_GCC_VERSION < 903 -#define FMT_ENUM_UNDERLYING_TYPE(type) -#else -#define FMT_ENUM_UNDERLYING_TYPE(type) : type -#endif -namespace align { -enum type FMT_ENUM_UNDERLYING_TYPE(unsigned char) { - none, left, right, center, numeric}; -} -using align_t = align::type; -namespace sign { -enum type FMT_ENUM_UNDERLYING_TYPE(unsigned char) {none, minus, plus, space}; -} -using sign_t = sign::type; - -namespace detail { - -// Workaround an array initialization issue in gcc 4.8. -template -struct fill_t { -private: - enum { max_size = 4 }; - Char data_[max_size] = {Char(' '), Char(0), Char(0), Char(0)}; - unsigned char size_ = 1; - -public: - FMT_CONSTEXPR void operator=(basic_string_view s) { - auto size = s.size(); - FMT_ASSERT(size <= max_size, "invalid fill"); - for (size_t i = 0; i < size; ++i) - data_[i] = s[i]; - size_ = static_cast(size); - } - - constexpr auto size() const -> size_t { return size_; } - constexpr auto data() const -> const Char * { return data_; } - - FMT_CONSTEXPR auto operator[](size_t index) -> Char & { - return data_[index]; - } - FMT_CONSTEXPR auto operator[](size_t index) const -> const Char & { - return data_[index]; - } -}; -} // namespace detail - -enum class presentation_type : unsigned char { - none, - dec, // 'd' - oct, // 'o' - hex_lower, // 'x' - hex_upper, // 'X' - bin_lower, // 'b' - bin_upper, // 'B' - hexfloat_lower, // 'a' - hexfloat_upper, // 'A' - exp_lower, // 'e' - exp_upper, // 'E' - fixed_lower, // 'f' - fixed_upper, // 'F' - general_lower, // 'g' - general_upper, // 'G' - chr, // 'c' - string, // 's' - pointer, // 'p' - debug // '?' -}; - -// Format specifiers for built-in and string types. -template -struct format_specs { - int width; - int precision; - presentation_type type; - align_t align : 4; - sign_t sign : 3; - bool alt : 1; // Alternate form ('#'). 
- bool localized : 1; - detail::fill_t fill; - - constexpr format_specs() - : width(0) - , precision(-1) - , type(presentation_type::none) - , align(align::none) - , sign(sign::none) - , alt(false) - , localized(false) {} -}; - -namespace detail { - -enum class arg_id_kind { none, index, name }; - -// An argument reference. -template -struct arg_ref { - FMT_CONSTEXPR arg_ref() : kind(arg_id_kind::none), val() {} - - FMT_CONSTEXPR explicit arg_ref(int index) - : kind(arg_id_kind::index), val(index) {} - FMT_CONSTEXPR explicit arg_ref(basic_string_view name) - : kind(arg_id_kind::name), val(name) {} - - FMT_CONSTEXPR auto operator=(int idx) -> arg_ref & { - kind = arg_id_kind::index; - val.index = idx; - return *this; - } - - arg_id_kind kind; - union value { - FMT_CONSTEXPR value(int idx = 0) : index(idx) {} - FMT_CONSTEXPR value(basic_string_view n) : name(n) {} - - int index; - basic_string_view name; - } val; -}; - -// Format specifiers with width and precision resolved at formatting rather -// than parsing time to allow reusing the same parsed specifiers with -// different sets of arguments (precompilation of format strings). -template -struct dynamic_format_specs : format_specs { - arg_ref width_ref; - arg_ref precision_ref; -}; - -// Converts a character to ASCII. Returns '\0' on conversion failure. -template ::value)> -constexpr auto to_ascii(Char c) -> char { - return c <= 0xff ? static_cast(c) : '\0'; -} -template ::value)> -constexpr auto to_ascii(Char c) -> char { - return c <= 0xff ? static_cast(c) : '\0'; -} - -// Returns the number of code units in a code point or 1 on error. -template -FMT_CONSTEXPR auto code_point_length(const Char *begin) -> int { - if (const_check(sizeof(Char) != 1)) return 1; - auto c = static_cast(*begin); - return static_cast((0x3a55000000000000ull >> (2 * (c >> 3))) & 0x3) - + 1; -} - -// Return the result via the out param to workaround gcc bug 77539. -template -FMT_CONSTEXPR auto find(Ptr first, Ptr last, T value, Ptr &out) -> bool { - for (out = first; out != last; ++out) { - if (*out == value) return true; - } - return false; -} - -template <> -inline auto find(const char *first, const char *last, char value, - const char *&out) -> bool { - out = static_cast( - std::memchr(first, value, to_unsigned(last - first))); - return out != nullptr; -} - -// Parses the range [begin, end) as an unsigned integer. This function assumes -// that the range is non-empty and the first character is a digit. -template -FMT_CONSTEXPR auto parse_nonnegative_int( - const Char *&begin, const Char *end, int error_value) noexcept -> int { - FMT_ASSERT(begin != end && '0' <= *begin && *begin <= '9', ""); - unsigned value = 0, prev = 0; - auto p = begin; - do { - prev = value; - value = value * 10 + unsigned(*p - '0'); - ++p; - } while (p != end && '0' <= *p && *p <= '9'); - auto num_digits = p - begin; - begin = p; - if (num_digits <= std::numeric_limits::digits10) - return static_cast(value); - // Check for overflow. - const unsigned max = to_unsigned((std::numeric_limits::max)()); - return num_digits == std::numeric_limits::digits10 + 1 - && prev * 10ull + unsigned(p[-1] - '0') <= max - ? 
static_cast(value) - : error_value; -} - -FMT_CONSTEXPR inline auto parse_align(char c) -> align_t { - switch (c) { - case '<': return align::left; - case '>': return align::right; - case '^': return align::center; - } - return align::none; -} - -template -constexpr auto is_name_start(Char c) -> bool { - return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '_'; -} - -template -FMT_CONSTEXPR auto do_parse_arg_id( - const Char *begin, const Char *end, Handler &&handler) -> const Char * { - Char c = *begin; - if (c >= '0' && c <= '9') { - int index = 0; - constexpr int max = (std::numeric_limits::max)(); - if (c != '0') - index = parse_nonnegative_int(begin, end, max); - else - ++begin; - if (begin == end || (*begin != '}' && *begin != ':')) - throw_format_error("invalid format string"); - else - handler.on_index(index); - return begin; - } - if (!is_name_start(c)) { - throw_format_error("invalid format string"); - return begin; - } - auto it = begin; - do { - ++it; - } while (it != end && (is_name_start(*it) || ('0' <= *it && *it <= '9'))); - handler.on_name({begin, to_unsigned(it - begin)}); - return it; -} - -template -FMT_CONSTEXPR FMT_INLINE auto parse_arg_id( - const Char *begin, const Char *end, Handler &&handler) -> const Char * { - FMT_ASSERT(begin != end, ""); - Char c = *begin; - if (c != '}' && c != ':') return do_parse_arg_id(begin, end, handler); - handler.on_auto(); - return begin; -} - -template -struct dynamic_spec_id_handler { - basic_format_parse_context &ctx; - arg_ref &ref; - - FMT_CONSTEXPR void on_auto() { - int id = ctx.next_arg_id(); - ref = arg_ref(id); - ctx.check_dynamic_spec(id); - } - FMT_CONSTEXPR void on_index(int id) { - ref = arg_ref(id); - ctx.check_arg_id(id); - ctx.check_dynamic_spec(id); - } - FMT_CONSTEXPR void on_name(basic_string_view id) { - ref = arg_ref(id); - ctx.check_arg_id(id); - } -}; - -// Parses [integer | "{" [arg_id] "}"]. -template -FMT_CONSTEXPR auto parse_dynamic_spec(const Char *begin, const Char *end, - int &value, arg_ref &ref, basic_format_parse_context &ctx) - -> const Char * { - FMT_ASSERT(begin != end, ""); - if ('0' <= *begin && *begin <= '9') { - int val = parse_nonnegative_int(begin, end, -1); - if (val != -1) - value = val; - else - throw_format_error("number is too big"); - } else if (*begin == '{') { - ++begin; - auto handler = dynamic_spec_id_handler {ctx, ref}; - if (begin != end) begin = parse_arg_id(begin, end, handler); - if (begin != end && *begin == '}') return ++begin; - throw_format_error("invalid format string"); - } - return begin; -} - -template -FMT_CONSTEXPR auto parse_precision(const Char *begin, const Char *end, - int &value, arg_ref &ref, basic_format_parse_context &ctx) - -> const Char * { - ++begin; - if (begin == end || *begin == '}') { - throw_format_error("invalid precision"); - return begin; - } - return parse_dynamic_spec(begin, end, value, ref, ctx); -} - -enum class state { start, align, sign, hash, zero, width, precision, locale }; - -// Parses standard format specifiers. -template -FMT_CONSTEXPR FMT_INLINE auto parse_format_specs(const Char *begin, - const Char *end, dynamic_format_specs &specs, - basic_format_parse_context &ctx, type arg_type) -> const Char * { - auto c = '\0'; - if (end - begin > 1) { - auto next = to_ascii(begin[1]); - c = parse_align(next) == align::none ? 
to_ascii(*begin) : '\0'; - } else { - if (begin == end) return begin; - c = to_ascii(*begin); - } - - struct { - state current_state = state::start; - FMT_CONSTEXPR void operator()(state s, bool valid = true) { - if (current_state >= s || !valid) - throw_format_error("invalid format specifier"); - current_state = s; - } - } enter_state; - - using pres = presentation_type; - constexpr auto integral_set = sint_set | uint_set | bool_set | char_set; - struct { - const Char *&begin; - dynamic_format_specs &specs; - type arg_type; - - FMT_CONSTEXPR auto operator()(pres pres_type, int set) -> const Char * { - if (!in(arg_type, set)) { - if (arg_type == type::none_type) return begin; - throw_format_error("invalid format specifier"); - } - specs.type = pres_type; - return begin + 1; - } - } parse_presentation_type {begin, specs, arg_type}; - - for (;;) { - switch (c) { - case '<': - case '>': - case '^': - enter_state(state::align); - specs.align = parse_align(c); - ++begin; - break; - case '+': - case '-': - case ' ': - if (arg_type == type::none_type) return begin; - enter_state(state::sign, in(arg_type, sint_set | float_set)); - switch (c) { - case '+': specs.sign = sign::plus; break; - case '-': specs.sign = sign::minus; break; - case ' ': specs.sign = sign::space; break; - } - ++begin; - break; - case '#': - if (arg_type == type::none_type) return begin; - enter_state(state::hash, is_arithmetic_type(arg_type)); - specs.alt = true; - ++begin; - break; - case '0': - enter_state(state::zero); - if (!is_arithmetic_type(arg_type)) { - if (arg_type == type::none_type) return begin; - throw_format_error( - "format specifier requires numeric argument"); - } - if (specs.align == align::none) { - // Ignore 0 if align is specified for compatibility with std::format. 
- specs.align = align::numeric; - specs.fill[0] = Char('0'); - } - ++begin; - break; - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': - case '{': - enter_state(state::width); - begin = parse_dynamic_spec( - begin, end, specs.width, specs.width_ref, ctx); - break; - case '.': - if (arg_type == type::none_type) return begin; - enter_state(state::precision, - in(arg_type, float_set | string_set | cstring_set)); - begin = parse_precision( - begin, end, specs.precision, specs.precision_ref, ctx); - break; - case 'L': - if (arg_type == type::none_type) return begin; - enter_state(state::locale, is_arithmetic_type(arg_type)); - specs.localized = true; - ++begin; - break; - case 'd': return parse_presentation_type(pres::dec, integral_set); - case 'o': return parse_presentation_type(pres::oct, integral_set); - case 'x': - return parse_presentation_type(pres::hex_lower, integral_set); - case 'X': - return parse_presentation_type(pres::hex_upper, integral_set); - case 'b': - return parse_presentation_type(pres::bin_lower, integral_set); - case 'B': - return parse_presentation_type(pres::bin_upper, integral_set); - case 'a': - return parse_presentation_type(pres::hexfloat_lower, float_set); - case 'A': - return parse_presentation_type(pres::hexfloat_upper, float_set); - case 'e': - return parse_presentation_type(pres::exp_lower, float_set); - case 'E': - return parse_presentation_type(pres::exp_upper, float_set); - case 'f': - return parse_presentation_type(pres::fixed_lower, float_set); - case 'F': - return parse_presentation_type(pres::fixed_upper, float_set); - case 'g': - return parse_presentation_type(pres::general_lower, float_set); - case 'G': - return parse_presentation_type(pres::general_upper, float_set); - case 'c': - if (arg_type == type::bool_type) - throw_format_error("invalid format specifier"); - return parse_presentation_type(pres::chr, integral_set); - case 's': - return parse_presentation_type( - pres::string, bool_set | string_set | cstring_set); - case 'p': - return parse_presentation_type( - pres::pointer, pointer_set | cstring_set); - case '?': - return parse_presentation_type( - pres::debug, char_set | string_set | cstring_set); - case '}': return begin; - default: { - if (*begin == '}') return begin; - // Parse fill and alignment. 
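The state machine in `parse_format_specs` above accepts the standard spec grammar: fill/align, sign, `#`, `0`, width, precision, `L`, then a presentation type. A few examples exercising the branches just parsed, with outputs per the documented fmt behavior:

```cpp
#include <fmt/core.h>

int main() {
  fmt::print("[{:*^6}]\n", 42);            // [**42**]  '*' fill, center align
  fmt::print("[{:+}] [{: }]\n", 42, 42);   // [+42] [ 42]  sign handling
  fmt::print("[{:05}]\n", 42);             // [00042]  '0' implies numeric align
  fmt::print("[{:<05}]\n", 42);            // [42   ]  explicit align: the '0'
                                           //          is ignored, per the
                                           //          comment above
  fmt::print("[{:{}.{}f}]\n", 3.14159, 10, 3);   // [     3.142]  dynamic
                                           //   width/precision via arg_ref
  fmt::print("{:#x} {:b} {:e}\n", 255, 5, 0.5);  // 0xff 101 5.000000e-01
  fmt::print("{:?}\n", "a\tb");            // "a\tb"  debug presentation ('?')
}
```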
- auto fill_end = begin + code_point_length(begin); - if (end - fill_end <= 0) { - throw_format_error("invalid format specifier"); - return begin; - } - if (*begin == '{') { - throw_format_error("invalid fill character '{'"); - return begin; - } - auto align = parse_align(to_ascii(*fill_end)); - enter_state(state::align, align != align::none); - specs.fill = {begin, to_unsigned(fill_end - begin)}; - specs.align = align; - begin = fill_end + 1; - } - } - if (begin == end) return begin; - c = to_ascii(*begin); - } -} - -template -FMT_CONSTEXPR auto parse_replacement_field( - const Char *begin, const Char *end, Handler &&handler) -> const Char * { - struct id_adapter { - Handler &handler; - int arg_id; - - FMT_CONSTEXPR void on_auto() { arg_id = handler.on_arg_id(); } - FMT_CONSTEXPR void on_index(int id) { arg_id = handler.on_arg_id(id); } - FMT_CONSTEXPR void on_name(basic_string_view id) { - arg_id = handler.on_arg_id(id); - } - }; - - ++begin; - if (begin == end) return handler.on_error("invalid format string"), end; - if (*begin == '}') { - handler.on_replacement_field(handler.on_arg_id(), begin); - } else if (*begin == '{') { - handler.on_text(begin, begin + 1); - } else { - auto adapter = id_adapter {handler, 0}; - begin = parse_arg_id(begin, end, adapter); - Char c = begin != end ? *begin : Char(); - if (c == '}') { - handler.on_replacement_field(adapter.arg_id, begin); - } else if (c == ':') { - begin = handler.on_format_specs(adapter.arg_id, begin + 1, end); - if (begin == end || *begin != '}') - return handler.on_error("unknown format specifier"), end; - } else { - return handler.on_error("missing '}' in format string"), end; - } - } - return begin + 1; -} - -template -FMT_CONSTEXPR FMT_INLINE void parse_format_string( - basic_string_view format_str, Handler &&handler) { - auto begin = format_str.data(); - auto end = begin + format_str.size(); - if (end - begin < 32) { - // Use a simple loop instead of memchr for small strings. - const Char *p = begin; - while (p != end) { - auto c = *p++; - if (c == '{') { - handler.on_text(begin, p - 1); - begin = p = parse_replacement_field(p - 1, end, handler); - } else if (c == '}') { - if (p == end || *p != '}') - return handler.on_error("unmatched '}' in format string"); - handler.on_text(begin, p); - begin = ++p; - } - } - handler.on_text(begin, end); - return; - } - struct writer { - FMT_CONSTEXPR void operator()(const Char *from, const Char *to) { - if (from == to) return; - for (;;) { - const Char *p = nullptr; - if (!find(from, to, Char('}'), p)) - return handler_.on_text(from, to); - ++p; - if (p == to || *p != '}') - return handler_.on_error("unmatched '}' in format string"); - handler_.on_text(from, p); - from = p + 1; - } - } - Handler &handler_; - } write = {handler}; - while (begin != end) { - // Doing two passes with memchr (one for '{' and another for '}') is up to - // 2.5x faster than the naive one-pass implementation on big format strings. 
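`parse_format_string` above also defines the escape rules: `{{` and `}}` are literal braces, and an unmatched `}` is an error:

```cpp
#include <fmt/core.h>

int main() {
  fmt::print("{{}} looks like {}\n", "braces");  // "{} looks like braces"
  // fmt::format(fmt::runtime("}")) throws: "unmatched '}' in format string"
}
```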
- const Char *p = begin; - if (*begin != '{' && !find(begin + 1, end, Char('{'), p)) - return write(begin, end); - write(begin, p); - begin = parse_replacement_field(p, end, handler); - } -} - -template ::value> -struct strip_named_arg { - using type = T; -}; -template -struct strip_named_arg { - using type = remove_cvref_t; -}; - -template -FMT_CONSTEXPR auto parse_format_specs(ParseContext &ctx) - -> decltype(ctx.begin()) { - using char_type = typename ParseContext::char_type; - using context = buffer_context; - using mapped_type = conditional_t::value - != type::custom_type, - decltype(arg_mapper().map(std::declval())), - typename strip_named_arg::type>; -#if defined(__cpp_if_constexpr) - if constexpr (std::is_default_constructible< - formatter>::value) { - return formatter().parse(ctx); - } else { - type_is_unformattable_for _; - return ctx.begin(); - } -#else - return formatter().parse(ctx); -#endif -} - -// Checks char specs and returns true iff the presentation type is char-like. -template -FMT_CONSTEXPR auto check_char_specs(const format_specs &specs) -> bool { - if (specs.type != presentation_type::none - && specs.type != presentation_type::chr - && specs.type != presentation_type::debug) { - return false; - } - if (specs.align == align::numeric || specs.sign != sign::none || specs.alt) - throw_format_error("invalid format specifier for char"); - return true; -} - -#if FMT_USE_NONTYPE_TEMPLATE_ARGS -template -constexpr auto get_arg_index_by_name(basic_string_view name) -> int { - if constexpr (is_statically_named_arg()) { - if (name == T::name) return N; - } - if constexpr (sizeof...(Args) > 0) - return get_arg_index_by_name(name); - (void)name; // Workaround an MSVC bug about "unused" parameter. - return -1; -} -#endif - -template -FMT_CONSTEXPR auto get_arg_index_by_name(basic_string_view name) -> int { -#if FMT_USE_NONTYPE_TEMPLATE_ARGS - if constexpr (sizeof...(Args) > 0) - return get_arg_index_by_name<0, Args...>(name); -#endif - (void)name; - return -1; -} - -template -class format_string_checker { -private: - using parse_context_type = compile_parse_context; - static constexpr int num_args = sizeof...(Args); - - // Format specifier parsing function. - // In the future basic_format_parse_context will replace compile_parse_context - // here and will use is_constant_evaluated and downcasting to access the data - // needed for compile-time checks: https://godbolt.org/z/GvWzcTjh1. - using parse_func = const Char *(*)(parse_context_type &); - - type types_[num_args > 0 ? static_cast(num_args) : 1]; - parse_context_type context_; - parse_func parse_funcs_[num_args > 0 ? 
static_cast(num_args) : 1]; - -public: - explicit FMT_CONSTEXPR format_string_checker(basic_string_view fmt) - : types_ {mapped_type_constant>::value...} - , context_(fmt, num_args, types_) - , parse_funcs_ {&parse_format_specs...} {} - - FMT_CONSTEXPR void on_text(const Char *, const Char *) {} - - FMT_CONSTEXPR auto on_arg_id() -> int { return context_.next_arg_id(); } - FMT_CONSTEXPR auto on_arg_id(int id) -> int { - return context_.check_arg_id(id), id; - } - FMT_CONSTEXPR auto on_arg_id(basic_string_view id) -> int { -#if FMT_USE_NONTYPE_TEMPLATE_ARGS - auto index = get_arg_index_by_name(id); - if (index < 0) on_error("named argument is not found"); - return index; -#else - (void)id; - on_error( - "compile-time checks for named arguments require C++20 " - "support"); - return 0; -#endif - } - - FMT_CONSTEXPR void on_replacement_field(int id, const Char *begin) { - on_format_specs(id, begin, begin); // Call parse() on empty specs. - } - - FMT_CONSTEXPR auto on_format_specs(int id, const Char *begin, const Char *) - -> const Char * { - context_.advance_to(begin); - // id >= 0 check is a workaround for gcc 10 bug (#2065). - return id >= 0 && id < num_args ? parse_funcs_[id](context_) : begin; - } - - FMT_CONSTEXPR void on_error(const char *message) { - throw_format_error(message); - } -}; - -// Reports a compile-time error if S is not a valid format string. -template ::value)> -FMT_INLINE void check_format_string(const S &) { -#ifdef FMT_ENFORCE_COMPILE_STRING - static_assert(is_compile_string::value, - "FMT_ENFORCE_COMPILE_STRING requires all format strings to use " - "FMT_STRING."); -#endif -} -template ::value)> -void check_format_string(S format_str) { - using char_t = typename S::char_type; - FMT_CONSTEXPR auto s = basic_string_view(format_str); - using checker = format_string_checker...>; - FMT_CONSTEXPR bool error = (parse_format_string(s, checker(s)), true); - ignore_unused(error); -} - -template -struct vformat_args { - using type = basic_format_args>, Char>>; -}; -template <> -struct vformat_args { - using type = format_args; -}; - -// Use vformat_args and avoid type_identity to keep symbols short. -template -void vformat_to(buffer &buf, basic_string_view fmt, - typename vformat_args::type args, locale_ref loc = {}); - -FMT_API void vprint_mojibake(std::FILE *, string_view, format_args); -#ifndef _WIN32 -inline void vprint_mojibake(std::FILE *, string_view, format_args) {} -#endif -} // namespace detail - -FMT_BEGIN_EXPORT - -// A formatter specialization for natively supported types. -template -struct formatter::value - != detail::type::custom_type>> { -private: - detail::dynamic_format_specs specs_; - -public: - template - FMT_CONSTEXPR auto parse(ParseContext &ctx) -> const Char * { - auto type = detail::type_constant::value; - auto end = detail::parse_format_specs( - ctx.begin(), ctx.end(), specs_, ctx, type); - if (type == detail::type::char_type) detail::check_char_specs(specs_); - return end; - } - - template ::value, - FMT_ENABLE_IF(U == detail::type::string_type - || U == detail::type::cstring_type - || U == detail::type::char_type)> - FMT_CONSTEXPR void set_debug_format(bool set = true) { - specs_.type = set ? presentation_type::debug : presentation_type::none; - } - - template - FMT_CONSTEXPR auto format(const T &val, FormatContext &ctx) const - -> decltype(ctx.out()); -}; - -template -struct runtime_format_string { - basic_string_view str; -}; - -/** A compile-time format string. 
*/ -template -class basic_format_string { -private: - basic_string_view str_; - -public: - template >::value)> - FMT_CONSTEVAL FMT_INLINE basic_format_string(const S &s) : str_(s) { - static_assert(detail::count<(std::is_base_of>::value - && std::is_reference::value)...>() - == 0, - "passing views as lvalues is disallowed"); -#ifdef FMT_HAS_CONSTEVAL - if constexpr (detail::count_named_args() - == detail::count_statically_named_args()) { - using checker = detail::format_string_checker...>; - detail::parse_format_string(str_, checker(s)); - } -#else - detail::check_format_string(s); -#endif - } - basic_format_string(runtime_format_string fmt) : str_(fmt.str) {} - - FMT_INLINE operator basic_string_view() const { return str_; } - FMT_INLINE auto get() const -> basic_string_view { return str_; } -}; - -#if FMT_GCC_VERSION && FMT_GCC_VERSION < 409 -// Workaround broken conversion on older gcc. -template -using format_string = string_view; -inline auto runtime(string_view s) -> string_view { - return s; -} -#else -template -using format_string = basic_format_string...>; -/** - \rst - Creates a runtime format string. - - **Example**:: - - // Check format string at runtime instead of compile-time. - fmt::print(fmt::runtime("{:d}"), "I am not a number"); - \endrst - */ -inline auto runtime(string_view s) -> runtime_format_string<> { - return {{s}}; -} -#endif - -FMT_API auto vformat(string_view fmt, format_args args) -> std::string; - -/** - \rst - Formats ``args`` according to specifications in ``fmt`` and returns the result - as a string. - - **Example**:: - - #include - std::string message = fmt::format("The answer is {}.", 42); - \endrst -*/ -template -FMT_NODISCARD FMT_INLINE auto format(format_string fmt, T &&...args) - -> std::string { - return vformat(fmt, fmt::make_format_args(args...)); -} - -/** Formats a string and writes the output to ``out``. */ -template ::value)> -auto vformat_to(OutputIt out, string_view fmt, format_args args) -> OutputIt { - auto &&buf = detail::get_buffer(out); - detail::vformat_to(buf, fmt, args, {}); - return detail::get_iterator(buf, out); -} - -/** - \rst - Formats ``args`` according to specifications in ``fmt``, writes the result to - the output iterator ``out`` and returns the iterator past the end of the output - range. `format_to` does not append a terminating null character. - - **Example**:: - - auto out = std::vector(); - fmt::format_to(std::back_inserter(out), "{}", 42); - \endrst - */ -template ::value)> -FMT_INLINE auto format_to(OutputIt out, format_string fmt, T &&...args) - -> OutputIt { - return vformat_to(out, fmt, fmt::make_format_args(args...)); -} - -template -struct format_to_n_result { - /** Iterator past the end of the output range. */ - OutputIt out; - /** Total (not truncated) output size. */ - size_t size; -}; - -template ::value)> -auto vformat_to_n(OutputIt out, size_t n, string_view fmt, format_args args) - -> format_to_n_result { - using traits = detail::fixed_buffer_traits; - auto buf = detail::iterator_buffer(out, n); - detail::vformat_to(buf, fmt, args, {}); - return {buf.out(), buf.count()}; -} - -/** - \rst - Formats ``args`` according to specifications in ``fmt``, writes up to ``n`` - characters of the result to the output iterator ``out`` and returns the total - (not truncated) output size and the iterator past the end of the output range. - `format_to_n` does not append a terminating null character. 
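Taken together, `basic_format_string`'s consteval constructor and `fmt::runtime` above give the compile-time/run-time checking split. An illustration (the first commented line would be rejected at compile time under C++20):

```cpp
#include <fmt/core.h>
#include <string>

int main() {
  // fmt::format("{:d}", "str");          // compile error: caught in the
                                          // FMT_CONSTEVAL constructor above
  std::string pattern = "{:d}";
  fmt::print(fmt::runtime(pattern), 42);  // checked at run time instead; a
                                          // mismatched argument would throw
                                          // fmt::format_error
}
```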
- \endrst - */ -template ::value)> -FMT_INLINE auto format_to_n(OutputIt out, size_t n, format_string fmt, - T &&...args) -> format_to_n_result { - return vformat_to_n(out, n, fmt, fmt::make_format_args(args...)); -} - -/** Returns the number of chars in the output of ``format(fmt, args...)``. */ -template -FMT_NODISCARD FMT_INLINE auto formatted_size( - format_string fmt, T &&...args) -> size_t { - auto buf = detail::counting_buffer<>(); - detail::vformat_to(buf, fmt, fmt::make_format_args(args...), {}); - return buf.count(); -} - -FMT_API void vprint(string_view fmt, format_args args); -FMT_API void vprint(std::FILE *f, string_view fmt, format_args args); - -/** - \rst - Formats ``args`` according to specifications in ``fmt`` and writes the output - to ``stdout``. - - **Example**:: - - fmt::print("Elapsed time: {0:.2f} seconds", 1.23); - \endrst - */ -template -FMT_INLINE void print(format_string fmt, T &&...args) { - const auto &vargs = fmt::make_format_args(args...); - return detail::is_utf8() ? vprint(fmt, vargs) - : detail::vprint_mojibake(stdout, fmt, vargs); -} - -/** - \rst - Formats ``args`` according to specifications in ``fmt`` and writes the - output to the file ``f``. - - **Example**:: - - fmt::print(stderr, "Don't {}!", "panic"); - \endrst - */ -template -FMT_INLINE void print(std::FILE *f, format_string fmt, T &&...args) { - const auto &vargs = fmt::make_format_args(args...); - return detail::is_utf8() ? vprint(f, fmt, vargs) - : detail::vprint_mojibake(f, fmt, vargs); -} - -/** - Formats ``args`` according to specifications in ``fmt`` and writes the - output to the file ``f`` followed by a newline. - */ -template -FMT_INLINE void println(std::FILE *f, format_string fmt, T &&...args) { - return fmt::print(f, "{}\n", fmt::format(fmt, std::forward(args)...)); -} - -/** - Formats ``args`` according to specifications in ``fmt`` and writes the output - to ``stdout`` followed by a newline. - */ -template -FMT_INLINE void println(format_string fmt, T &&...args) { - return fmt::println(stdout, fmt, std::forward(args)...); -} - -FMT_END_EXPORT -FMT_GCC_PRAGMA("GCC pop_options") -FMT_END_NAMESPACE - -#ifdef FMT_HEADER_ONLY -#include "common/spdlog/fmt/bundled/format.h" -#endif -#endif // FMT_CORE_H_ diff --git a/src/common/spdlog/fmt/bundled/format-inl.h b/src/common/spdlog/fmt/bundled/format-inl.h deleted file mode 100755 index bc912667ca0..00000000000 --- a/src/common/spdlog/fmt/bundled/format-inl.h +++ /dev/null @@ -1,2859 +0,0 @@ -// Formatting library for C++ - implementation -// -// Copyright (c) 2012 - 2016, Victor Zverovich -// All rights reserved. -// -// For the license information refer to format.h. - -#ifndef FMT_FORMAT_INL_H_ -#define FMT_FORMAT_INL_H_ - -#include -#include // errno -#include -#include -#include - -#ifndef FMT_STATIC_THOUSANDS_SEPARATOR -#include -#endif - -#if defined(_WIN32) && !defined(FMT_WINDOWS_NO_WCHAR) -#include // _isatty -#endif - -#include "format.h" - -FMT_BEGIN_NAMESPACE -namespace detail { - -FMT_FUNC void assert_fail(const char *file, int line, const char *message) { - // Use unchecked std::fprintf to avoid triggering another assertion when - // writing to stderr fails - std::fprintf(stderr, "%s:%d: assertion failed: %s", file, line, message); - // Chosen instead of std::abort to satisfy Clang in CUDA mode during device - // code pass. 
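Two of the entry points above in action: `format_to_n` truncates but still reports the untruncated size (via `fixed_buffer_traits`), and `formatted_size` counts without producing output (via `counting_buffer`):

```cpp
#include <fmt/core.h>

int main() {
  char buf[8];
  auto res = fmt::format_to_n(buf, sizeof(buf) - 1, "{}", 123456789);
  *res.out = '\0';  // format_to_n does not null-terminate
  // buf == "1234567", res.size == 9 (the full size, not the truncated one)

  auto n = fmt::formatted_size("{}", 123456789);
  fmt::print("{} vs {}\n", n, res.size);  // "9 vs 9"
}
```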
diff --git a/src/common/spdlog/fmt/bundled/format-inl.h b/src/common/spdlog/fmt/bundled/format-inl.h
deleted file mode 100755
index bc912667ca0..00000000000
--- a/src/common/spdlog/fmt/bundled/format-inl.h
+++ /dev/null
@@ -1,2859 +0,0 @@
-// Formatting library for C++ - implementation
-//
-// Copyright (c) 2012 - 2016, Victor Zverovich
-// All rights reserved.
-//
-// For the license information refer to format.h.
-
-#ifndef FMT_FORMAT_INL_H_
-#define FMT_FORMAT_INL_H_
-
-#include <algorithm>
-#include <cerrno> // errno
-#include <climits>
-#include <cmath>
-#include <exception>
-
-#ifndef FMT_STATIC_THOUSANDS_SEPARATOR
-#include <locale>
-#endif
-
-#if defined(_WIN32) && !defined(FMT_WINDOWS_NO_WCHAR)
-#include <io.h> // _isatty
-#endif
-
-#include "format.h"
-
-FMT_BEGIN_NAMESPACE
-namespace detail {
-
-FMT_FUNC void assert_fail(const char *file, int line, const char *message) {
-    // Use unchecked std::fprintf to avoid triggering another assertion when
-    // writing to stderr fails
-    std::fprintf(stderr, "%s:%d: assertion failed: %s", file, line, message);
-    // Chosen instead of std::abort to satisfy Clang in CUDA mode during device
-    // code pass.
-    std::terminate();
-}
-
-FMT_FUNC void throw_format_error(const char *message) {
-    FMT_THROW(format_error(message));
-}
-
-FMT_FUNC void format_error_code(detail::buffer<char> &out, int error_code,
-        string_view message) noexcept {
-    // Report error code making sure that the output fits into
-    // inline_buffer_size to avoid dynamic memory allocation and potential
-    // bad_alloc.
-    out.try_resize(0);
-    static const char SEP[] = ": ";
-    static const char ERROR_STR[] = "error ";
-    // Subtract 2 to account for terminating null characters in SEP and ERROR_STR.
-    size_t error_code_size = sizeof(SEP) + sizeof(ERROR_STR) - 2;
-    auto abs_value = static_cast<uint32_or_64_or_128_t<int>>(error_code);
-    if (detail::is_negative(error_code)) {
-        abs_value = 0 - abs_value;
-        ++error_code_size;
-    }
-    error_code_size += detail::to_unsigned(detail::count_digits(abs_value));
-    auto it = buffer_appender<char>(out);
-    if (message.size() <= inline_buffer_size - error_code_size)
-        fmt::format_to(it, FMT_STRING("{}{}"), message, SEP);
-    fmt::format_to(it, FMT_STRING("{}{}"), ERROR_STR, error_code);
-    FMT_ASSERT(out.size() <= inline_buffer_size, "");
-}
-
-FMT_FUNC void report_error(
-        format_func func, int error_code, const char *message) noexcept {
-    memory_buffer full_message;
-    func(full_message, error_code, message);
-    // Don't use fwrite_fully because the latter may throw.
-    if (std::fwrite(full_message.data(), full_message.size(), 1, stderr) > 0)
-        std::fputc('\n', stderr);
-}
-
-// A wrapper around fwrite that throws on error.
-inline void fwrite_fully(const void *ptr, size_t count, FILE *stream) {
-    size_t written = std::fwrite(ptr, 1, count, stream);
-    if (written < count)
-        FMT_THROW(system_error(errno, FMT_STRING("cannot write to file")));
-}
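// Standalone sketch of the budget check format_error_code() performs above:
// the message is kept only if it fits a fixed buffer together with the
// "error <code>" suffix. The 32-byte buffer and helper name are hypothetical
// stand-ins for fmt's inline_buffer_size machinery:
#include <cstddef>
#include <cstdio>
#include <cstring>

static void write_error(char *buf, std::size_t budget, int code,
        const char *msg) {
    char suffix[32];
    int len = std::snprintf(suffix, sizeof(suffix), "error %d", code);
    if (std::strlen(msg) + 2 + static_cast<std::size_t>(len) <= budget)
        std::snprintf(buf, budget, "%s: %s", msg, suffix); // fits: keep msg
    else
        std::snprintf(buf, budget, "%s", suffix); // too long: code only
}

int main() {
    char buf[32];
    write_error(buf, sizeof(buf), 2, "cannot open file");
    std::puts(buf); // "cannot open file: error 2"
}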
-
-#ifndef FMT_STATIC_THOUSANDS_SEPARATOR
-template <typename Locale>
-locale_ref::locale_ref(const Locale &loc) : locale_(&loc) {
-    static_assert(std::is_same<Locale, std::locale>::value, "");
-}
-
-template <typename Locale>
-auto locale_ref::get() const -> Locale {
-    static_assert(std::is_same<Locale, std::locale>::value, "");
-    return locale_ ? *static_cast<const std::locale *>(locale_) : std::locale();
-}
-
-template <typename Char>
-FMT_FUNC auto thousands_sep_impl(locale_ref loc) -> thousands_sep_result<Char> {
-    auto &facet = std::use_facet<std::numpunct<Char>>(loc.get<std::locale>());
-    auto grouping = facet.grouping();
-    auto thousands_sep = grouping.empty() ? Char() : facet.thousands_sep();
-    return {std::move(grouping), thousands_sep};
-}
-template <typename Char>
-FMT_FUNC auto decimal_point_impl(locale_ref loc) -> Char {
-    return std::use_facet<std::numpunct<Char>>(loc.get<std::locale>())
-            .decimal_point();
-}
-#else
-template <typename Char>
-FMT_FUNC auto thousands_sep_impl(locale_ref) -> thousands_sep_result<Char> {
-    return {"\03", FMT_STATIC_THOUSANDS_SEPARATOR};
-}
-template <typename Char>
-FMT_FUNC Char decimal_point_impl(locale_ref) {
-    return '.';
-}
-#endif
-
-FMT_FUNC auto write_loc(appender out, loc_value value,
-        const format_specs<> &specs, locale_ref loc) -> bool {
-#ifndef FMT_STATIC_THOUSANDS_SEPARATOR
-    auto locale = loc.get<std::locale>();
-    // We cannot use the num_put facet because it may produce output in
-    // a wrong encoding.
-    using facet = format_facet<std::locale>;
-    if (std::has_facet<facet>(locale))
-        return std::use_facet<facet>(locale).put(out, value, specs);
-    return facet(locale).put(out, value, specs);
-#endif
-    return false;
-}
-} // namespace detail
-
-template <typename Locale>
-typename Locale::id format_facet<Locale>::id;
-
-#ifndef FMT_STATIC_THOUSANDS_SEPARATOR
-template <typename Locale>
-format_facet<Locale>::format_facet(Locale &loc) {
-    auto &numpunct = std::use_facet<std::numpunct<char>>(loc);
-    grouping_ = numpunct.grouping();
-    if (!grouping_.empty())
-        separator_ = std::string(1, numpunct.thousands_sep());
-}
-
-template <>
-FMT_API FMT_FUNC auto format_facet<std::locale>::do_put(appender out,
-        loc_value val, const format_specs<> &specs) const -> bool {
-    return val.visit(detail::loc_writer<> {
-            out, specs, separator_, grouping_, decimal_point_});
-}
-#endif
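// The numpunct facet consulted by thousands_sep_impl() above can be queried
// directly; a minimal sketch (the named locale is an assumption and may not
// be installed on a given system, hence the fallback):
#include <iostream>
#include <locale>

int main() {
    std::locale loc = std::locale::classic();
    try {
        loc = std::locale("en_US.UTF-8"); // hypothetical locale name
    } catch (const std::runtime_error &) { /* keep the classic locale */
    }
    const auto &np = std::use_facet<std::numpunct<char>>(loc);
    std::cout << "thousands_sep: '" << np.thousands_sep()
              << "' grouping bytes: " << np.grouping().size() << '\n';
}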
-
-FMT_FUNC auto vsystem_error(int error_code, string_view fmt, format_args args)
-        -> std::system_error {
-    auto ec = std::error_code(error_code, std::generic_category());
-    return std::system_error(ec, vformat(fmt, args));
-}
-
-namespace detail {
-
-template <typename F>
-inline auto operator==(basic_fp<F> x, basic_fp<F> y) -> bool {
-    return x.f == y.f && x.e == y.e;
-}
-
-// Compilers should be able to optimize this into the ror instruction.
-FMT_CONSTEXPR inline auto rotr(uint32_t n, uint32_t r) noexcept -> uint32_t {
-    r &= 31;
-    return (n >> r) | (n << (32 - r));
-}
-FMT_CONSTEXPR inline auto rotr(uint64_t n, uint32_t r) noexcept -> uint64_t {
-    r &= 63;
-    return (n >> r) | (n << (64 - r));
-}
-
-// Implementation of Dragonbox algorithm: https://github.com/jk-jeon/dragonbox.
-namespace dragonbox {
-// Computes upper 64 bits of multiplication of a 32-bit unsigned integer and a
-// 64-bit unsigned integer.
-inline auto umul96_upper64(uint32_t x, uint64_t y) noexcept -> uint64_t {
-    return umul128_upper64(static_cast<uint64_t>(x) << 32, y);
-}
-
-// Computes lower 128 bits of multiplication of a 64-bit unsigned integer and a
-// 128-bit unsigned integer.
-inline auto umul192_lower128(uint64_t x, uint128_fallback y) noexcept
-        -> uint128_fallback {
-    uint64_t high = x * y.high();
-    uint128_fallback high_low = umul128(x, y.low());
-    return {high + high_low.high(), high_low.low()};
-}
-
-// Computes lower 64 bits of multiplication of a 32-bit unsigned integer and a
-// 64-bit unsigned integer.
-inline auto umul96_lower64(uint32_t x, uint64_t y) noexcept -> uint64_t {
-    return x * y;
-}
-
-// Various fast log computations.
-inline auto floor_log10_pow2_minus_log10_4_over_3(int e) noexcept -> int {
-    FMT_ASSERT(e <= 2936 && e >= -2985, "too large exponent");
-    return (e * 631305 - 261663) >> 21;
-}
-
-FMT_INLINE_VARIABLE constexpr struct {
-    uint32_t divisor;
-    int shift_amount;
-} div_small_pow10_infos[] = {{10, 16}, {100, 16}};
-
-// Replaces n by floor(n / pow(10, N)) returning true if and only if n is
-// divisible by pow(10, N).
-// Precondition: n <= pow(10, N + 1).
-template <int N>
-auto check_divisibility_and_divide_by_pow10(uint32_t &n) noexcept -> bool {
-    // The numbers below are chosen such that:
-    // 1. floor(n/d) = floor(nm / 2^k) where d=10 or d=100,
-    // 2. nm mod 2^k < m if and only if n is divisible by d,
-    // where m is magic_number, k is shift_amount
-    // and d is divisor.
-    //
-    // Item 1 is a common technique of replacing division by a constant with
-    // multiplication, see e.g. "Division by Invariant Integers Using
-    // Multiplication" by Granlund and Montgomery (1994). magic_number (m) is set
-    // to ceil(2^k/d) for large enough k.
-    // The idea for item 2 originates from Schubfach.
-    constexpr auto info = div_small_pow10_infos[N - 1];
-    FMT_ASSERT(n <= info.divisor * 10, "n is too large");
-    constexpr uint32_t magic_number
-            = (1u << info.shift_amount) / info.divisor + 1;
-    n *= magic_number;
-    const uint32_t comparison_mask = (1u << info.shift_amount) - 1;
-    bool result = (n & comparison_mask) < magic_number;
-    n >>= info.shift_amount;
-    return result;
-}
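// Standalone check of the two properties claimed above for d = 10, k = 16:
// with m = ceil(2^16 / 10) = 6554 and n <= 100, one multiply yields both the
// quotient and the divisibility test. A minimal sketch:
#include <cassert>
#include <cstdint>

int main() {
    constexpr uint32_t m = (1u << 16) / 10 + 1; // 6554 = ceil(2^16 / 10)
    for (uint32_t n = 0; n <= 100; ++n) {
        uint32_t prod = n * m;
        assert((prod >> 16) == n / 10);                  // item 1: division
        assert(((prod & 0xFFFFu) < m) == (n % 10 == 0)); // item 2: divisibility
    }
}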
-
-// Computes floor(n / pow(10, N)) for small n and N.
-// Precondition: n <= pow(10, N + 1).
-template <int N>
-auto small_division_by_pow10(uint32_t n) noexcept -> uint32_t {
-    constexpr auto info = div_small_pow10_infos[N - 1];
-    FMT_ASSERT(n <= info.divisor * 10, "n is too large");
-    constexpr uint32_t magic_number
-            = (1u << info.shift_amount) / info.divisor + 1;
-    return (n * magic_number) >> info.shift_amount;
-}
-
-// Computes floor(n / 10^(kappa + 1)) (float)
-inline auto divide_by_10_to_kappa_plus_1(uint32_t n) noexcept -> uint32_t {
-    // 1374389535 = ceil(2^37/100)
-    return static_cast<uint32_t>((static_cast<uint64_t>(n) * 1374389535) >> 37);
-}
-// Computes floor(n / 10^(kappa + 1)) (double)
-inline auto divide_by_10_to_kappa_plus_1(uint64_t n) noexcept -> uint64_t {
-    // 2361183241434822607 = ceil(2^(64+7)/1000)
-    return umul128_upper64(n, 2361183241434822607ull) >> 7;
-}
-
-// Various subroutines using pow10 cache
-template <typename T>
-struct cache_accessor;
-
-template <>
-struct cache_accessor<float> {
-    using carrier_uint = float_info<float>::carrier_uint;
-    using cache_entry_type = uint64_t;
-
-    static auto get_cached_power(int k) noexcept -> uint64_t {
-        FMT_ASSERT(
-                k >= float_info<float>::min_k && k <= float_info<float>::max_k,
-                "k is out of range");
-        static constexpr const uint64_t pow10_significands[] = {
-                0x81ceb32c4b43fcf5, 0xa2425ff75e14fc32, 0xcad2f7f5359a3b3f,
-                0xfd87b5f28300ca0e, 0x9e74d1b791e07e49, 0xc612062576589ddb,
-                0xf79687aed3eec552, 0x9abe14cd44753b53, 0xc16d9a0095928a28,
-                0xf1c90080baf72cb2, 0x971da05074da7bef, 0xbce5086492111aeb,
-                0xec1e4a7db69561a6, 0x9392ee8e921d5d08, 0xb877aa3236a4b44a,
-                0xe69594bec44de15c, 0x901d7cf73ab0acda, 0xb424dc35095cd810,
-                0xe12e13424bb40e14, 0x8cbccc096f5088cc, 0xafebff0bcb24aaff,
-                0xdbe6fecebdedd5bf, 0x89705f4136b4a598, 0xabcc77118461cefd,
-                0xd6bf94d5e57a42bd, 0x8637bd05af6c69b6, 0xa7c5ac471b478424,
-                0xd1b71758e219652c, 0x83126e978d4fdf3c, 0xa3d70a3d70a3d70b,
-                0xcccccccccccccccd, 0x8000000000000000, 0xa000000000000000,
-                0xc800000000000000, 0xfa00000000000000, 0x9c40000000000000,
-                0xc350000000000000, 0xf424000000000000, 0x9896800000000000,
-                0xbebc200000000000, 0xee6b280000000000, 0x9502f90000000000,
-                0xba43b74000000000, 0xe8d4a51000000000, 0x9184e72a00000000,
-                0xb5e620f480000000, 0xe35fa931a0000000, 0x8e1bc9bf04000000,
-                0xb1a2bc2ec5000000, 0xde0b6b3a76400000, 0x8ac7230489e80000,
-                0xad78ebc5ac620000, 0xd8d726b7177a8000, 0x878678326eac9000,
-                0xa968163f0a57b400, 0xd3c21bcecceda100, 0x84595161401484a0,
-                0xa56fa5b99019a5c8, 0xcecb8f27f4200f3a, 0x813f3978f8940985,
-                0xa18f07d736b90be6, 0xc9f2c9cd04674edf, 0xfc6f7c4045812297,
-                0x9dc5ada82b70b59e, 0xc5371912364ce306, 0xf684df56c3e01bc7,
-                0x9a130b963a6c115d, 0xc097ce7bc90715b4, 0xf0bdc21abb48db21,
-                0x96769950b50d88f5, 0xbc143fa4e250eb32, 0xeb194f8e1ae525fe,
-                0x92efd1b8d0cf37bf, 0xb7abc627050305ae, 0xe596b7b0c643c71a,
-                0x8f7e32ce7bea5c70, 0xb35dbf821ae4f38c, 0xe0352f62a19e306f};
-        return pow10_significands[k - float_info<float>::min_k];
-    }
-
-    struct compute_mul_result {
-        carrier_uint result;
-        bool is_integer;
-    };
-    struct compute_mul_parity_result {
-        bool parity;
-        bool is_integer;
-    };
-
-    static auto compute_mul(carrier_uint u,
-            const cache_entry_type &cache) noexcept -> compute_mul_result {
-        auto r = umul96_upper64(u, cache);
-        return {static_cast<carrier_uint>(r >> 32),
-                static_cast<carrier_uint>(r) == 0};
-    }
-
-    static auto compute_delta(const cache_entry_type &cache, int beta) noexcept
-            -> uint32_t {
-        return static_cast<uint32_t>(cache >> (64 - 1 - beta));
-    }
-
-    static auto compute_mul_parity(carrier_uint two_f,
-            const cache_entry_type &cache, int beta) noexcept
-            -> compute_mul_parity_result {
-        FMT_ASSERT(beta >= 1, "");
-        FMT_ASSERT(beta < 64, "");
-
-        auto r = umul96_lower64(two_f, cache);
-        return {((r >> (64 - beta)) & 1) != 0,
-                static_cast<uint32_t>(r >> (32 - beta)) == 0};
-    }
-
-    static auto compute_left_endpoint_for_shorter_interval_case(
-            const cache_entry_type &cache, int beta) noexcept -> carrier_uint {
-        return static_cast<carrier_uint>(
-                (cache - (cache >> (num_significand_bits<float>() + 2)))
-                >> (64 - num_significand_bits<float>() - 1 - beta));
-    }
-
-    static auto compute_right_endpoint_for_shorter_interval_case(
-            const cache_entry_type &cache, int beta) noexcept -> carrier_uint {
-        return static_cast<carrier_uint>(
-                (cache + (cache >> (num_significand_bits<float>() + 1)))
-                >> (64 - num_significand_bits<float>() - 1 - beta));
-    }
-
-    static auto compute_round_up_for_shorter_interval_case(
-            const cache_entry_type &cache, int beta) noexcept -> carrier_uint {
-        return (static_cast<carrier_uint>(cache
-                        >> (64 - num_significand_bits<float>() - 2 - beta))
-                       + 1)
-                / 2;
-    }
-};
-
-template <>
-struct cache_accessor<double> {
-    using carrier_uint = float_info<double>::carrier_uint;
-    using cache_entry_type = uint128_fallback;
-
-    static auto get_cached_power(int k) noexcept -> uint128_fallback {
-        FMT_ASSERT(k >= float_info<double>::min_k
-                        && k <= float_info<double>::max_k,
-                "k is out of range");
-
-        static constexpr const uint128_fallback pow10_significands[] = {
-#if FMT_USE_FULL_CACHE_DRAGONBOX
-                {0xff77b1fcbebcdc4f, 0x25e8e89c13bb0f7b},
-                {0x9faacf3df73609b1, 0x77b191618c54e9ad},
-                {0xc795830d75038c1d, 0xd59df5b9ef6a2418},
-                {0xf97ae3d0d2446f25, 0x4b0573286b44ad1e},
-                {0x9becce62836ac577, 0x4ee367f9430aec33},
-                {0xc2e801fb244576d5, 0x229c41f793cda740},
-                {0xf3a20279ed56d48a, 0x6b43527578c11110},
-                {0x9845418c345644d6, 0x830a13896b78aaaa},
-                {0xbe5691ef416bd60c, 0x23cc986bc656d554},
-                {0xedec366b11c6cb8f, 0x2cbfbe86b7ec8aa9},
-                {0x94b3a202eb1c3f39, 0x7bf7d71432f3d6aa},
-                {0xb9e08a83a5e34f07, 0xdaf5ccd93fb0cc54},
-                {0xe858ad248f5c22c9, 0xd1b3400f8f9cff69},
-                {0x91376c36d99995be, 0x23100809b9c21fa2},
-                {0xb58547448ffffb2d, 0xabd40a0c2832a78b},
-                {0xe2e69915b3fff9f9, 0x16c90c8f323f516d},
-                {0x8dd01fad907ffc3b, 0xae3da7d97f6792e4},
-                {0xb1442798f49ffb4a, 0x99cd11cfdf41779d},
-                {0xdd95317f31c7fa1d, 0x40405643d711d584},
-                {0x8a7d3eef7f1cfc52, 0x482835ea666b2573},
-                {0xad1c8eab5ee43b66, 0xda3243650005eed0},
-                {0xd863b256369d4a40, 0x90bed43e40076a83},
-                {0x873e4f75e2224e68, 0x5a7744a6e804a292},
-                {0xa90de3535aaae202, 0x711515d0a205cb37},
-                {0xd3515c2831559a83, 0x0d5a5b44ca873e04},
-                {0x8412d9991ed58091, 0xe858790afe9486c3},
-                {0xa5178fff668ae0b6, 0x626e974dbe39a873},
-                {0xce5d73ff402d98e3, 0xfb0a3d212dc81290},
-                {0x80fa687f881c7f8e, 0x7ce66634bc9d0b9a},
-                {0xa139029f6a239f72, 0x1c1fffc1ebc44e81},
-                {0xc987434744ac874e, 0xa327ffb266b56221},
-                {0xfbe9141915d7a922, 0x4bf1ff9f0062baa9},
-                {0x9d71ac8fada6c9b5, 0x6f773fc3603db4aa},
-                {0xc4ce17b399107c22, 0xcb550fb4384d21d4},
-                {0xf6019da07f549b2b, 0x7e2a53a146606a49},
-                {0x99c102844f94e0fb, 0x2eda7444cbfc426e},
-                {0xc0314325637a1939, 0xfa911155fefb5309},
-                {0xf03d93eebc589f88, 0x793555ab7eba27cb},
-                {0x96267c7535b763b5, 0x4bc1558b2f3458df},
-                {0xbbb01b9283253ca2, 0x9eb1aaedfb016f17},
{0xea9c227723ee8bcb, 0x465e15a979c1cadd}, - {0x92a1958a7675175f, 0x0bfacd89ec191eca}, - {0xb749faed14125d36, 0xcef980ec671f667c}, - {0xe51c79a85916f484, 0x82b7e12780e7401b}, - {0x8f31cc0937ae58d2, 0xd1b2ecb8b0908811}, - {0xb2fe3f0b8599ef07, 0x861fa7e6dcb4aa16}, - {0xdfbdcece67006ac9, 0x67a791e093e1d49b}, - {0x8bd6a141006042bd, 0xe0c8bb2c5c6d24e1}, - {0xaecc49914078536d, 0x58fae9f773886e19}, - {0xda7f5bf590966848, 0xaf39a475506a899f}, - {0x888f99797a5e012d, 0x6d8406c952429604}, - {0xaab37fd7d8f58178, 0xc8e5087ba6d33b84}, - {0xd5605fcdcf32e1d6, 0xfb1e4a9a90880a65}, - {0x855c3be0a17fcd26, 0x5cf2eea09a550680}, - {0xa6b34ad8c9dfc06f, 0xf42faa48c0ea481f}, - {0xd0601d8efc57b08b, 0xf13b94daf124da27}, - {0x823c12795db6ce57, 0x76c53d08d6b70859}, - {0xa2cb1717b52481ed, 0x54768c4b0c64ca6f}, - {0xcb7ddcdda26da268, 0xa9942f5dcf7dfd0a}, - {0xfe5d54150b090b02, 0xd3f93b35435d7c4d}, - {0x9efa548d26e5a6e1, 0xc47bc5014a1a6db0}, - {0xc6b8e9b0709f109a, 0x359ab6419ca1091c}, - {0xf867241c8cc6d4c0, 0xc30163d203c94b63}, - {0x9b407691d7fc44f8, 0x79e0de63425dcf1e}, - {0xc21094364dfb5636, 0x985915fc12f542e5}, - {0xf294b943e17a2bc4, 0x3e6f5b7b17b2939e}, - {0x979cf3ca6cec5b5a, 0xa705992ceecf9c43}, - {0xbd8430bd08277231, 0x50c6ff782a838354}, - {0xece53cec4a314ebd, 0xa4f8bf5635246429}, - {0x940f4613ae5ed136, 0x871b7795e136be9a}, - {0xb913179899f68584, 0x28e2557b59846e40}, - {0xe757dd7ec07426e5, 0x331aeada2fe589d0}, - {0x9096ea6f3848984f, 0x3ff0d2c85def7622}, - {0xb4bca50b065abe63, 0x0fed077a756b53aa}, - {0xe1ebce4dc7f16dfb, 0xd3e8495912c62895}, - {0x8d3360f09cf6e4bd, 0x64712dd7abbbd95d}, - {0xb080392cc4349dec, 0xbd8d794d96aacfb4}, - {0xdca04777f541c567, 0xecf0d7a0fc5583a1}, - {0x89e42caaf9491b60, 0xf41686c49db57245}, - {0xac5d37d5b79b6239, 0x311c2875c522ced6}, - {0xd77485cb25823ac7, 0x7d633293366b828c}, - {0x86a8d39ef77164bc, 0xae5dff9c02033198}, - {0xa8530886b54dbdeb, 0xd9f57f830283fdfd}, - {0xd267caa862a12d66, 0xd072df63c324fd7c}, - {0x8380dea93da4bc60, 0x4247cb9e59f71e6e}, - {0xa46116538d0deb78, 0x52d9be85f074e609}, - {0xcd795be870516656, 0x67902e276c921f8c}, - {0x806bd9714632dff6, 0x00ba1cd8a3db53b7}, - {0xa086cfcd97bf97f3, 0x80e8a40eccd228a5}, - {0xc8a883c0fdaf7df0, 0x6122cd128006b2ce}, - {0xfad2a4b13d1b5d6c, 0x796b805720085f82}, - {0x9cc3a6eec6311a63, 0xcbe3303674053bb1}, - {0xc3f490aa77bd60fc, 0xbedbfc4411068a9d}, - {0xf4f1b4d515acb93b, 0xee92fb5515482d45}, - {0x991711052d8bf3c5, 0x751bdd152d4d1c4b}, - {0xbf5cd54678eef0b6, 0xd262d45a78a0635e}, - {0xef340a98172aace4, 0x86fb897116c87c35}, - {0x9580869f0e7aac0e, 0xd45d35e6ae3d4da1}, - {0xbae0a846d2195712, 0x8974836059cca10a}, - {0xe998d258869facd7, 0x2bd1a438703fc94c}, - {0x91ff83775423cc06, 0x7b6306a34627ddd0}, - {0xb67f6455292cbf08, 0x1a3bc84c17b1d543}, - {0xe41f3d6a7377eeca, 0x20caba5f1d9e4a94}, - {0x8e938662882af53e, 0x547eb47b7282ee9d}, - {0xb23867fb2a35b28d, 0xe99e619a4f23aa44}, - {0xdec681f9f4c31f31, 0x6405fa00e2ec94d5}, - {0x8b3c113c38f9f37e, 0xde83bc408dd3dd05}, - {0xae0b158b4738705e, 0x9624ab50b148d446}, - {0xd98ddaee19068c76, 0x3badd624dd9b0958}, - {0x87f8a8d4cfa417c9, 0xe54ca5d70a80e5d7}, - {0xa9f6d30a038d1dbc, 0x5e9fcf4ccd211f4d}, - {0xd47487cc8470652b, 0x7647c32000696720}, - {0x84c8d4dfd2c63f3b, 0x29ecd9f40041e074}, - {0xa5fb0a17c777cf09, 0xf468107100525891}, - {0xcf79cc9db955c2cc, 0x7182148d4066eeb5}, - {0x81ac1fe293d599bf, 0xc6f14cd848405531}, - {0xa21727db38cb002f, 0xb8ada00e5a506a7d}, - {0xca9cf1d206fdc03b, 0xa6d90811f0e4851d}, - {0xfd442e4688bd304a, 0x908f4a166d1da664}, - {0x9e4a9cec15763e2e, 0x9a598e4e043287ff}, - {0xc5dd44271ad3cdba, 
0x40eff1e1853f29fe}, - {0xf7549530e188c128, 0xd12bee59e68ef47d}, - {0x9a94dd3e8cf578b9, 0x82bb74f8301958cf}, - {0xc13a148e3032d6e7, 0xe36a52363c1faf02}, - {0xf18899b1bc3f8ca1, 0xdc44e6c3cb279ac2}, - {0x96f5600f15a7b7e5, 0x29ab103a5ef8c0ba}, - {0xbcb2b812db11a5de, 0x7415d448f6b6f0e8}, - {0xebdf661791d60f56, 0x111b495b3464ad22}, - {0x936b9fcebb25c995, 0xcab10dd900beec35}, - {0xb84687c269ef3bfb, 0x3d5d514f40eea743}, - {0xe65829b3046b0afa, 0x0cb4a5a3112a5113}, - {0x8ff71a0fe2c2e6dc, 0x47f0e785eaba72ac}, - {0xb3f4e093db73a093, 0x59ed216765690f57}, - {0xe0f218b8d25088b8, 0x306869c13ec3532d}, - {0x8c974f7383725573, 0x1e414218c73a13fc}, - {0xafbd2350644eeacf, 0xe5d1929ef90898fb}, - {0xdbac6c247d62a583, 0xdf45f746b74abf3a}, - {0x894bc396ce5da772, 0x6b8bba8c328eb784}, - {0xab9eb47c81f5114f, 0x066ea92f3f326565}, - {0xd686619ba27255a2, 0xc80a537b0efefebe}, - {0x8613fd0145877585, 0xbd06742ce95f5f37}, - {0xa798fc4196e952e7, 0x2c48113823b73705}, - {0xd17f3b51fca3a7a0, 0xf75a15862ca504c6}, - {0x82ef85133de648c4, 0x9a984d73dbe722fc}, - {0xa3ab66580d5fdaf5, 0xc13e60d0d2e0ebbb}, - {0xcc963fee10b7d1b3, 0x318df905079926a9}, - {0xffbbcfe994e5c61f, 0xfdf17746497f7053}, - {0x9fd561f1fd0f9bd3, 0xfeb6ea8bedefa634}, - {0xc7caba6e7c5382c8, 0xfe64a52ee96b8fc1}, - {0xf9bd690a1b68637b, 0x3dfdce7aa3c673b1}, - {0x9c1661a651213e2d, 0x06bea10ca65c084f}, - {0xc31bfa0fe5698db8, 0x486e494fcff30a63}, - {0xf3e2f893dec3f126, 0x5a89dba3c3efccfb}, - {0x986ddb5c6b3a76b7, 0xf89629465a75e01d}, - {0xbe89523386091465, 0xf6bbb397f1135824}, - {0xee2ba6c0678b597f, 0x746aa07ded582e2d}, - {0x94db483840b717ef, 0xa8c2a44eb4571cdd}, - {0xba121a4650e4ddeb, 0x92f34d62616ce414}, - {0xe896a0d7e51e1566, 0x77b020baf9c81d18}, - {0x915e2486ef32cd60, 0x0ace1474dc1d122f}, - {0xb5b5ada8aaff80b8, 0x0d819992132456bb}, - {0xe3231912d5bf60e6, 0x10e1fff697ed6c6a}, - {0x8df5efabc5979c8f, 0xca8d3ffa1ef463c2}, - {0xb1736b96b6fd83b3, 0xbd308ff8a6b17cb3}, - {0xddd0467c64bce4a0, 0xac7cb3f6d05ddbdf}, - {0x8aa22c0dbef60ee4, 0x6bcdf07a423aa96c}, - {0xad4ab7112eb3929d, 0x86c16c98d2c953c7}, - {0xd89d64d57a607744, 0xe871c7bf077ba8b8}, - {0x87625f056c7c4a8b, 0x11471cd764ad4973}, - {0xa93af6c6c79b5d2d, 0xd598e40d3dd89bd0}, - {0xd389b47879823479, 0x4aff1d108d4ec2c4}, - {0x843610cb4bf160cb, 0xcedf722a585139bb}, - {0xa54394fe1eedb8fe, 0xc2974eb4ee658829}, - {0xce947a3da6a9273e, 0x733d226229feea33}, - {0x811ccc668829b887, 0x0806357d5a3f5260}, - {0xa163ff802a3426a8, 0xca07c2dcb0cf26f8}, - {0xc9bcff6034c13052, 0xfc89b393dd02f0b6}, - {0xfc2c3f3841f17c67, 0xbbac2078d443ace3}, - {0x9d9ba7832936edc0, 0xd54b944b84aa4c0e}, - {0xc5029163f384a931, 0x0a9e795e65d4df12}, - {0xf64335bcf065d37d, 0x4d4617b5ff4a16d6}, - {0x99ea0196163fa42e, 0x504bced1bf8e4e46}, - {0xc06481fb9bcf8d39, 0xe45ec2862f71e1d7}, - {0xf07da27a82c37088, 0x5d767327bb4e5a4d}, - {0x964e858c91ba2655, 0x3a6a07f8d510f870}, - {0xbbe226efb628afea, 0x890489f70a55368c}, - {0xeadab0aba3b2dbe5, 0x2b45ac74ccea842f}, - {0x92c8ae6b464fc96f, 0x3b0b8bc90012929e}, - {0xb77ada0617e3bbcb, 0x09ce6ebb40173745}, - {0xe55990879ddcaabd, 0xcc420a6a101d0516}, - {0x8f57fa54c2a9eab6, 0x9fa946824a12232e}, - {0xb32df8e9f3546564, 0x47939822dc96abfa}, - {0xdff9772470297ebd, 0x59787e2b93bc56f8}, - {0x8bfbea76c619ef36, 0x57eb4edb3c55b65b}, - {0xaefae51477a06b03, 0xede622920b6b23f2}, - {0xdab99e59958885c4, 0xe95fab368e45ecee}, - {0x88b402f7fd75539b, 0x11dbcb0218ebb415}, - {0xaae103b5fcd2a881, 0xd652bdc29f26a11a}, - {0xd59944a37c0752a2, 0x4be76d3346f04960}, - {0x857fcae62d8493a5, 0x6f70a4400c562ddc}, - {0xa6dfbd9fb8e5b88e, 0xcb4ccd500f6bb953}, - 
{0xd097ad07a71f26b2, 0x7e2000a41346a7a8}, - {0x825ecc24c873782f, 0x8ed400668c0c28c9}, - {0xa2f67f2dfa90563b, 0x728900802f0f32fb}, - {0xcbb41ef979346bca, 0x4f2b40a03ad2ffba}, - {0xfea126b7d78186bc, 0xe2f610c84987bfa9}, - {0x9f24b832e6b0f436, 0x0dd9ca7d2df4d7ca}, - {0xc6ede63fa05d3143, 0x91503d1c79720dbc}, - {0xf8a95fcf88747d94, 0x75a44c6397ce912b}, - {0x9b69dbe1b548ce7c, 0xc986afbe3ee11abb}, - {0xc24452da229b021b, 0xfbe85badce996169}, - {0xf2d56790ab41c2a2, 0xfae27299423fb9c4}, - {0x97c560ba6b0919a5, 0xdccd879fc967d41b}, - {0xbdb6b8e905cb600f, 0x5400e987bbc1c921}, - {0xed246723473e3813, 0x290123e9aab23b69}, - {0x9436c0760c86e30b, 0xf9a0b6720aaf6522}, - {0xb94470938fa89bce, 0xf808e40e8d5b3e6a}, - {0xe7958cb87392c2c2, 0xb60b1d1230b20e05}, - {0x90bd77f3483bb9b9, 0xb1c6f22b5e6f48c3}, - {0xb4ecd5f01a4aa828, 0x1e38aeb6360b1af4}, - {0xe2280b6c20dd5232, 0x25c6da63c38de1b1}, - {0x8d590723948a535f, 0x579c487e5a38ad0f}, - {0xb0af48ec79ace837, 0x2d835a9df0c6d852}, - {0xdcdb1b2798182244, 0xf8e431456cf88e66}, - {0x8a08f0f8bf0f156b, 0x1b8e9ecb641b5900}, - {0xac8b2d36eed2dac5, 0xe272467e3d222f40}, - {0xd7adf884aa879177, 0x5b0ed81dcc6abb10}, - {0x86ccbb52ea94baea, 0x98e947129fc2b4ea}, - {0xa87fea27a539e9a5, 0x3f2398d747b36225}, - {0xd29fe4b18e88640e, 0x8eec7f0d19a03aae}, - {0x83a3eeeef9153e89, 0x1953cf68300424ad}, - {0xa48ceaaab75a8e2b, 0x5fa8c3423c052dd8}, - {0xcdb02555653131b6, 0x3792f412cb06794e}, - {0x808e17555f3ebf11, 0xe2bbd88bbee40bd1}, - {0xa0b19d2ab70e6ed6, 0x5b6aceaeae9d0ec5}, - {0xc8de047564d20a8b, 0xf245825a5a445276}, - {0xfb158592be068d2e, 0xeed6e2f0f0d56713}, - {0x9ced737bb6c4183d, 0x55464dd69685606c}, - {0xc428d05aa4751e4c, 0xaa97e14c3c26b887}, - {0xf53304714d9265df, 0xd53dd99f4b3066a9}, - {0x993fe2c6d07b7fab, 0xe546a8038efe402a}, - {0xbf8fdb78849a5f96, 0xde98520472bdd034}, - {0xef73d256a5c0f77c, 0x963e66858f6d4441}, - {0x95a8637627989aad, 0xdde7001379a44aa9}, - {0xbb127c53b17ec159, 0x5560c018580d5d53}, - {0xe9d71b689dde71af, 0xaab8f01e6e10b4a7}, - {0x9226712162ab070d, 0xcab3961304ca70e9}, - {0xb6b00d69bb55c8d1, 0x3d607b97c5fd0d23}, - {0xe45c10c42a2b3b05, 0x8cb89a7db77c506b}, - {0x8eb98a7a9a5b04e3, 0x77f3608e92adb243}, - {0xb267ed1940f1c61c, 0x55f038b237591ed4}, - {0xdf01e85f912e37a3, 0x6b6c46dec52f6689}, - {0x8b61313bbabce2c6, 0x2323ac4b3b3da016}, - {0xae397d8aa96c1b77, 0xabec975e0a0d081b}, - {0xd9c7dced53c72255, 0x96e7bd358c904a22}, - {0x881cea14545c7575, 0x7e50d64177da2e55}, - {0xaa242499697392d2, 0xdde50bd1d5d0b9ea}, - {0xd4ad2dbfc3d07787, 0x955e4ec64b44e865}, - {0x84ec3c97da624ab4, 0xbd5af13bef0b113f}, - {0xa6274bbdd0fadd61, 0xecb1ad8aeacdd58f}, - {0xcfb11ead453994ba, 0x67de18eda5814af3}, - {0x81ceb32c4b43fcf4, 0x80eacf948770ced8}, - {0xa2425ff75e14fc31, 0xa1258379a94d028e}, - {0xcad2f7f5359a3b3e, 0x096ee45813a04331}, - {0xfd87b5f28300ca0d, 0x8bca9d6e188853fd}, - {0x9e74d1b791e07e48, 0x775ea264cf55347e}, - {0xc612062576589dda, 0x95364afe032a819e}, - {0xf79687aed3eec551, 0x3a83ddbd83f52205}, - {0x9abe14cd44753b52, 0xc4926a9672793543}, - {0xc16d9a0095928a27, 0x75b7053c0f178294}, - {0xf1c90080baf72cb1, 0x5324c68b12dd6339}, - {0x971da05074da7bee, 0xd3f6fc16ebca5e04}, - {0xbce5086492111aea, 0x88f4bb1ca6bcf585}, - {0xec1e4a7db69561a5, 0x2b31e9e3d06c32e6}, - {0x9392ee8e921d5d07, 0x3aff322e62439fd0}, - {0xb877aa3236a4b449, 0x09befeb9fad487c3}, - {0xe69594bec44de15b, 0x4c2ebe687989a9b4}, - {0x901d7cf73ab0acd9, 0x0f9d37014bf60a11}, - {0xb424dc35095cd80f, 0x538484c19ef38c95}, - {0xe12e13424bb40e13, 0x2865a5f206b06fba}, - {0x8cbccc096f5088cb, 0xf93f87b7442e45d4}, - {0xafebff0bcb24aafe, 
0xf78f69a51539d749}, - {0xdbe6fecebdedd5be, 0xb573440e5a884d1c}, - {0x89705f4136b4a597, 0x31680a88f8953031}, - {0xabcc77118461cefc, 0xfdc20d2b36ba7c3e}, - {0xd6bf94d5e57a42bc, 0x3d32907604691b4d}, - {0x8637bd05af6c69b5, 0xa63f9a49c2c1b110}, - {0xa7c5ac471b478423, 0x0fcf80dc33721d54}, - {0xd1b71758e219652b, 0xd3c36113404ea4a9}, - {0x83126e978d4fdf3b, 0x645a1cac083126ea}, - {0xa3d70a3d70a3d70a, 0x3d70a3d70a3d70a4}, - {0xcccccccccccccccc, 0xcccccccccccccccd}, - {0x8000000000000000, 0x0000000000000000}, - {0xa000000000000000, 0x0000000000000000}, - {0xc800000000000000, 0x0000000000000000}, - {0xfa00000000000000, 0x0000000000000000}, - {0x9c40000000000000, 0x0000000000000000}, - {0xc350000000000000, 0x0000000000000000}, - {0xf424000000000000, 0x0000000000000000}, - {0x9896800000000000, 0x0000000000000000}, - {0xbebc200000000000, 0x0000000000000000}, - {0xee6b280000000000, 0x0000000000000000}, - {0x9502f90000000000, 0x0000000000000000}, - {0xba43b74000000000, 0x0000000000000000}, - {0xe8d4a51000000000, 0x0000000000000000}, - {0x9184e72a00000000, 0x0000000000000000}, - {0xb5e620f480000000, 0x0000000000000000}, - {0xe35fa931a0000000, 0x0000000000000000}, - {0x8e1bc9bf04000000, 0x0000000000000000}, - {0xb1a2bc2ec5000000, 0x0000000000000000}, - {0xde0b6b3a76400000, 0x0000000000000000}, - {0x8ac7230489e80000, 0x0000000000000000}, - {0xad78ebc5ac620000, 0x0000000000000000}, - {0xd8d726b7177a8000, 0x0000000000000000}, - {0x878678326eac9000, 0x0000000000000000}, - {0xa968163f0a57b400, 0x0000000000000000}, - {0xd3c21bcecceda100, 0x0000000000000000}, - {0x84595161401484a0, 0x0000000000000000}, - {0xa56fa5b99019a5c8, 0x0000000000000000}, - {0xcecb8f27f4200f3a, 0x0000000000000000}, - {0x813f3978f8940984, 0x4000000000000000}, - {0xa18f07d736b90be5, 0x5000000000000000}, - {0xc9f2c9cd04674ede, 0xa400000000000000}, - {0xfc6f7c4045812296, 0x4d00000000000000}, - {0x9dc5ada82b70b59d, 0xf020000000000000}, - {0xc5371912364ce305, 0x6c28000000000000}, - {0xf684df56c3e01bc6, 0xc732000000000000}, - {0x9a130b963a6c115c, 0x3c7f400000000000}, - {0xc097ce7bc90715b3, 0x4b9f100000000000}, - {0xf0bdc21abb48db20, 0x1e86d40000000000}, - {0x96769950b50d88f4, 0x1314448000000000}, - {0xbc143fa4e250eb31, 0x17d955a000000000}, - {0xeb194f8e1ae525fd, 0x5dcfab0800000000}, - {0x92efd1b8d0cf37be, 0x5aa1cae500000000}, - {0xb7abc627050305ad, 0xf14a3d9e40000000}, - {0xe596b7b0c643c719, 0x6d9ccd05d0000000}, - {0x8f7e32ce7bea5c6f, 0xe4820023a2000000}, - {0xb35dbf821ae4f38b, 0xdda2802c8a800000}, - {0xe0352f62a19e306e, 0xd50b2037ad200000}, - {0x8c213d9da502de45, 0x4526f422cc340000}, - {0xaf298d050e4395d6, 0x9670b12b7f410000}, - {0xdaf3f04651d47b4c, 0x3c0cdd765f114000}, - {0x88d8762bf324cd0f, 0xa5880a69fb6ac800}, - {0xab0e93b6efee0053, 0x8eea0d047a457a00}, - {0xd5d238a4abe98068, 0x72a4904598d6d880}, - {0x85a36366eb71f041, 0x47a6da2b7f864750}, - {0xa70c3c40a64e6c51, 0x999090b65f67d924}, - {0xd0cf4b50cfe20765, 0xfff4b4e3f741cf6d}, - {0x82818f1281ed449f, 0xbff8f10e7a8921a5}, - {0xa321f2d7226895c7, 0xaff72d52192b6a0e}, - {0xcbea6f8ceb02bb39, 0x9bf4f8a69f764491}, - {0xfee50b7025c36a08, 0x02f236d04753d5b5}, - {0x9f4f2726179a2245, 0x01d762422c946591}, - {0xc722f0ef9d80aad6, 0x424d3ad2b7b97ef6}, - {0xf8ebad2b84e0d58b, 0xd2e0898765a7deb3}, - {0x9b934c3b330c8577, 0x63cc55f49f88eb30}, - {0xc2781f49ffcfa6d5, 0x3cbf6b71c76b25fc}, - {0xf316271c7fc3908a, 0x8bef464e3945ef7b}, - {0x97edd871cfda3a56, 0x97758bf0e3cbb5ad}, - {0xbde94e8e43d0c8ec, 0x3d52eeed1cbea318}, - {0xed63a231d4c4fb27, 0x4ca7aaa863ee4bde}, - {0x945e455f24fb1cf8, 0x8fe8caa93e74ef6b}, - 
{0xb975d6b6ee39e436, 0xb3e2fd538e122b45}, - {0xe7d34c64a9c85d44, 0x60dbbca87196b617}, - {0x90e40fbeea1d3a4a, 0xbc8955e946fe31ce}, - {0xb51d13aea4a488dd, 0x6babab6398bdbe42}, - {0xe264589a4dcdab14, 0xc696963c7eed2dd2}, - {0x8d7eb76070a08aec, 0xfc1e1de5cf543ca3}, - {0xb0de65388cc8ada8, 0x3b25a55f43294bcc}, - {0xdd15fe86affad912, 0x49ef0eb713f39ebf}, - {0x8a2dbf142dfcc7ab, 0x6e3569326c784338}, - {0xacb92ed9397bf996, 0x49c2c37f07965405}, - {0xd7e77a8f87daf7fb, 0xdc33745ec97be907}, - {0x86f0ac99b4e8dafd, 0x69a028bb3ded71a4}, - {0xa8acd7c0222311bc, 0xc40832ea0d68ce0d}, - {0xd2d80db02aabd62b, 0xf50a3fa490c30191}, - {0x83c7088e1aab65db, 0x792667c6da79e0fb}, - {0xa4b8cab1a1563f52, 0x577001b891185939}, - {0xcde6fd5e09abcf26, 0xed4c0226b55e6f87}, - {0x80b05e5ac60b6178, 0x544f8158315b05b5}, - {0xa0dc75f1778e39d6, 0x696361ae3db1c722}, - {0xc913936dd571c84c, 0x03bc3a19cd1e38ea}, - {0xfb5878494ace3a5f, 0x04ab48a04065c724}, - {0x9d174b2dcec0e47b, 0x62eb0d64283f9c77}, - {0xc45d1df942711d9a, 0x3ba5d0bd324f8395}, - {0xf5746577930d6500, 0xca8f44ec7ee3647a}, - {0x9968bf6abbe85f20, 0x7e998b13cf4e1ecc}, - {0xbfc2ef456ae276e8, 0x9e3fedd8c321a67f}, - {0xefb3ab16c59b14a2, 0xc5cfe94ef3ea101f}, - {0x95d04aee3b80ece5, 0xbba1f1d158724a13}, - {0xbb445da9ca61281f, 0x2a8a6e45ae8edc98}, - {0xea1575143cf97226, 0xf52d09d71a3293be}, - {0x924d692ca61be758, 0x593c2626705f9c57}, - {0xb6e0c377cfa2e12e, 0x6f8b2fb00c77836d}, - {0xe498f455c38b997a, 0x0b6dfb9c0f956448}, - {0x8edf98b59a373fec, 0x4724bd4189bd5ead}, - {0xb2977ee300c50fe7, 0x58edec91ec2cb658}, - {0xdf3d5e9bc0f653e1, 0x2f2967b66737e3ee}, - {0x8b865b215899f46c, 0xbd79e0d20082ee75}, - {0xae67f1e9aec07187, 0xecd8590680a3aa12}, - {0xda01ee641a708de9, 0xe80e6f4820cc9496}, - {0x884134fe908658b2, 0x3109058d147fdcde}, - {0xaa51823e34a7eede, 0xbd4b46f0599fd416}, - {0xd4e5e2cdc1d1ea96, 0x6c9e18ac7007c91b}, - {0x850fadc09923329e, 0x03e2cf6bc604ddb1}, - {0xa6539930bf6bff45, 0x84db8346b786151d}, - {0xcfe87f7cef46ff16, 0xe612641865679a64}, - {0x81f14fae158c5f6e, 0x4fcb7e8f3f60c07f}, - {0xa26da3999aef7749, 0xe3be5e330f38f09e}, - {0xcb090c8001ab551c, 0x5cadf5bfd3072cc6}, - {0xfdcb4fa002162a63, 0x73d9732fc7c8f7f7}, - {0x9e9f11c4014dda7e, 0x2867e7fddcdd9afb}, - {0xc646d63501a1511d, 0xb281e1fd541501b9}, - {0xf7d88bc24209a565, 0x1f225a7ca91a4227}, - {0x9ae757596946075f, 0x3375788de9b06959}, - {0xc1a12d2fc3978937, 0x0052d6b1641c83af}, - {0xf209787bb47d6b84, 0xc0678c5dbd23a49b}, - {0x9745eb4d50ce6332, 0xf840b7ba963646e1}, - {0xbd176620a501fbff, 0xb650e5a93bc3d899}, - {0xec5d3fa8ce427aff, 0xa3e51f138ab4cebf}, - {0x93ba47c980e98cdf, 0xc66f336c36b10138}, - {0xb8a8d9bbe123f017, 0xb80b0047445d4185}, - {0xe6d3102ad96cec1d, 0xa60dc059157491e6}, - {0x9043ea1ac7e41392, 0x87c89837ad68db30}, - {0xb454e4a179dd1877, 0x29babe4598c311fc}, - {0xe16a1dc9d8545e94, 0xf4296dd6fef3d67b}, - {0x8ce2529e2734bb1d, 0x1899e4a65f58660d}, - {0xb01ae745b101e9e4, 0x5ec05dcff72e7f90}, - {0xdc21a1171d42645d, 0x76707543f4fa1f74}, - {0x899504ae72497eba, 0x6a06494a791c53a9}, - {0xabfa45da0edbde69, 0x0487db9d17636893}, - {0xd6f8d7509292d603, 0x45a9d2845d3c42b7}, - {0x865b86925b9bc5c2, 0x0b8a2392ba45a9b3}, - {0xa7f26836f282b732, 0x8e6cac7768d7141f}, - {0xd1ef0244af2364ff, 0x3207d795430cd927}, - {0x8335616aed761f1f, 0x7f44e6bd49e807b9}, - {0xa402b9c5a8d3a6e7, 0x5f16206c9c6209a7}, - {0xcd036837130890a1, 0x36dba887c37a8c10}, - {0x802221226be55a64, 0xc2494954da2c978a}, - {0xa02aa96b06deb0fd, 0xf2db9baa10b7bd6d}, - {0xc83553c5c8965d3d, 0x6f92829494e5acc8}, - {0xfa42a8b73abbf48c, 0xcb772339ba1f17fa}, - {0x9c69a97284b578d7, 
0xff2a760414536efc}, - {0xc38413cf25e2d70d, 0xfef5138519684abb}, - {0xf46518c2ef5b8cd1, 0x7eb258665fc25d6a}, - {0x98bf2f79d5993802, 0xef2f773ffbd97a62}, - {0xbeeefb584aff8603, 0xaafb550ffacfd8fb}, - {0xeeaaba2e5dbf6784, 0x95ba2a53f983cf39}, - {0x952ab45cfa97a0b2, 0xdd945a747bf26184}, - {0xba756174393d88df, 0x94f971119aeef9e5}, - {0xe912b9d1478ceb17, 0x7a37cd5601aab85e}, - {0x91abb422ccb812ee, 0xac62e055c10ab33b}, - {0xb616a12b7fe617aa, 0x577b986b314d600a}, - {0xe39c49765fdf9d94, 0xed5a7e85fda0b80c}, - {0x8e41ade9fbebc27d, 0x14588f13be847308}, - {0xb1d219647ae6b31c, 0x596eb2d8ae258fc9}, - {0xde469fbd99a05fe3, 0x6fca5f8ed9aef3bc}, - {0x8aec23d680043bee, 0x25de7bb9480d5855}, - {0xada72ccc20054ae9, 0xaf561aa79a10ae6b}, - {0xd910f7ff28069da4, 0x1b2ba1518094da05}, - {0x87aa9aff79042286, 0x90fb44d2f05d0843}, - {0xa99541bf57452b28, 0x353a1607ac744a54}, - {0xd3fa922f2d1675f2, 0x42889b8997915ce9}, - {0x847c9b5d7c2e09b7, 0x69956135febada12}, - {0xa59bc234db398c25, 0x43fab9837e699096}, - {0xcf02b2c21207ef2e, 0x94f967e45e03f4bc}, - {0x8161afb94b44f57d, 0x1d1be0eebac278f6}, - {0xa1ba1ba79e1632dc, 0x6462d92a69731733}, - {0xca28a291859bbf93, 0x7d7b8f7503cfdcff}, - {0xfcb2cb35e702af78, 0x5cda735244c3d43f}, - {0x9defbf01b061adab, 0x3a0888136afa64a8}, - {0xc56baec21c7a1916, 0x088aaa1845b8fdd1}, - {0xf6c69a72a3989f5b, 0x8aad549e57273d46}, - {0x9a3c2087a63f6399, 0x36ac54e2f678864c}, - {0xc0cb28a98fcf3c7f, 0x84576a1bb416a7de}, - {0xf0fdf2d3f3c30b9f, 0x656d44a2a11c51d6}, - {0x969eb7c47859e743, 0x9f644ae5a4b1b326}, - {0xbc4665b596706114, 0x873d5d9f0dde1fef}, - {0xeb57ff22fc0c7959, 0xa90cb506d155a7eb}, - {0x9316ff75dd87cbd8, 0x09a7f12442d588f3}, - {0xb7dcbf5354e9bece, 0x0c11ed6d538aeb30}, - {0xe5d3ef282a242e81, 0x8f1668c8a86da5fb}, - {0x8fa475791a569d10, 0xf96e017d694487bd}, - {0xb38d92d760ec4455, 0x37c981dcc395a9ad}, - {0xe070f78d3927556a, 0x85bbe253f47b1418}, - {0x8c469ab843b89562, 0x93956d7478ccec8f}, - {0xaf58416654a6babb, 0x387ac8d1970027b3}, - {0xdb2e51bfe9d0696a, 0x06997b05fcc0319f}, - {0x88fcf317f22241e2, 0x441fece3bdf81f04}, - {0xab3c2fddeeaad25a, 0xd527e81cad7626c4}, - {0xd60b3bd56a5586f1, 0x8a71e223d8d3b075}, - {0x85c7056562757456, 0xf6872d5667844e4a}, - {0xa738c6bebb12d16c, 0xb428f8ac016561dc}, - {0xd106f86e69d785c7, 0xe13336d701beba53}, - {0x82a45b450226b39c, 0xecc0024661173474}, - {0xa34d721642b06084, 0x27f002d7f95d0191}, - {0xcc20ce9bd35c78a5, 0x31ec038df7b441f5}, - {0xff290242c83396ce, 0x7e67047175a15272}, - {0x9f79a169bd203e41, 0x0f0062c6e984d387}, - {0xc75809c42c684dd1, 0x52c07b78a3e60869}, - {0xf92e0c3537826145, 0xa7709a56ccdf8a83}, - {0x9bbcc7a142b17ccb, 0x88a66076400bb692}, - {0xc2abf989935ddbfe, 0x6acff893d00ea436}, - {0xf356f7ebf83552fe, 0x0583f6b8c4124d44}, - {0x98165af37b2153de, 0xc3727a337a8b704b}, - {0xbe1bf1b059e9a8d6, 0x744f18c0592e4c5d}, - {0xeda2ee1c7064130c, 0x1162def06f79df74}, - {0x9485d4d1c63e8be7, 0x8addcb5645ac2ba9}, - {0xb9a74a0637ce2ee1, 0x6d953e2bd7173693}, - {0xe8111c87c5c1ba99, 0xc8fa8db6ccdd0438}, - {0x910ab1d4db9914a0, 0x1d9c9892400a22a3}, - {0xb54d5e4a127f59c8, 0x2503beb6d00cab4c}, - {0xe2a0b5dc971f303a, 0x2e44ae64840fd61e}, - {0x8da471a9de737e24, 0x5ceaecfed289e5d3}, - {0xb10d8e1456105dad, 0x7425a83e872c5f48}, - {0xdd50f1996b947518, 0xd12f124e28f7771a}, - {0x8a5296ffe33cc92f, 0x82bd6b70d99aaa70}, - {0xace73cbfdc0bfb7b, 0x636cc64d1001550c}, - {0xd8210befd30efa5a, 0x3c47f7e05401aa4f}, - {0x8714a775e3e95c78, 0x65acfaec34810a72}, - {0xa8d9d1535ce3b396, 0x7f1839a741a14d0e}, - {0xd31045a8341ca07c, 0x1ede48111209a051}, - {0x83ea2b892091e44d, 0x934aed0aab460433}, - 
{0xa4e4b66b68b65d60, 0xf81da84d56178540}, - {0xce1de40642e3f4b9, 0x36251260ab9d668f}, - {0x80d2ae83e9ce78f3, 0xc1d72b7c6b42601a}, - {0xa1075a24e4421730, 0xb24cf65b8612f820}, - {0xc94930ae1d529cfc, 0xdee033f26797b628}, - {0xfb9b7cd9a4a7443c, 0x169840ef017da3b2}, - {0x9d412e0806e88aa5, 0x8e1f289560ee864f}, - {0xc491798a08a2ad4e, 0xf1a6f2bab92a27e3}, - {0xf5b5d7ec8acb58a2, 0xae10af696774b1dc}, - {0x9991a6f3d6bf1765, 0xacca6da1e0a8ef2a}, - {0xbff610b0cc6edd3f, 0x17fd090a58d32af4}, - {0xeff394dcff8a948e, 0xddfc4b4cef07f5b1}, - {0x95f83d0a1fb69cd9, 0x4abdaf101564f98f}, - {0xbb764c4ca7a4440f, 0x9d6d1ad41abe37f2}, - {0xea53df5fd18d5513, 0x84c86189216dc5ee}, - {0x92746b9be2f8552c, 0x32fd3cf5b4e49bb5}, - {0xb7118682dbb66a77, 0x3fbc8c33221dc2a2}, - {0xe4d5e82392a40515, 0x0fabaf3feaa5334b}, - {0x8f05b1163ba6832d, 0x29cb4d87f2a7400f}, - {0xb2c71d5bca9023f8, 0x743e20e9ef511013}, - {0xdf78e4b2bd342cf6, 0x914da9246b255417}, - {0x8bab8eefb6409c1a, 0x1ad089b6c2f7548f}, - {0xae9672aba3d0c320, 0xa184ac2473b529b2}, - {0xda3c0f568cc4f3e8, 0xc9e5d72d90a2741f}, - {0x8865899617fb1871, 0x7e2fa67c7a658893}, - {0xaa7eebfb9df9de8d, 0xddbb901b98feeab8}, - {0xd51ea6fa85785631, 0x552a74227f3ea566}, - {0x8533285c936b35de, 0xd53a88958f872760}, - {0xa67ff273b8460356, 0x8a892abaf368f138}, - {0xd01fef10a657842c, 0x2d2b7569b0432d86}, - {0x8213f56a67f6b29b, 0x9c3b29620e29fc74}, - {0xa298f2c501f45f42, 0x8349f3ba91b47b90}, - {0xcb3f2f7642717713, 0x241c70a936219a74}, - {0xfe0efb53d30dd4d7, 0xed238cd383aa0111}, - {0x9ec95d1463e8a506, 0xf4363804324a40ab}, - {0xc67bb4597ce2ce48, 0xb143c6053edcd0d6}, - {0xf81aa16fdc1b81da, 0xdd94b7868e94050b}, - {0x9b10a4e5e9913128, 0xca7cf2b4191c8327}, - {0xc1d4ce1f63f57d72, 0xfd1c2f611f63a3f1}, - {0xf24a01a73cf2dccf, 0xbc633b39673c8ced}, - {0x976e41088617ca01, 0xd5be0503e085d814}, - {0xbd49d14aa79dbc82, 0x4b2d8644d8a74e19}, - {0xec9c459d51852ba2, 0xddf8e7d60ed1219f}, - {0x93e1ab8252f33b45, 0xcabb90e5c942b504}, - {0xb8da1662e7b00a17, 0x3d6a751f3b936244}, - {0xe7109bfba19c0c9d, 0x0cc512670a783ad5}, - {0x906a617d450187e2, 0x27fb2b80668b24c6}, - {0xb484f9dc9641e9da, 0xb1f9f660802dedf7}, - {0xe1a63853bbd26451, 0x5e7873f8a0396974}, - {0x8d07e33455637eb2, 0xdb0b487b6423e1e9}, - {0xb049dc016abc5e5f, 0x91ce1a9a3d2cda63}, - {0xdc5c5301c56b75f7, 0x7641a140cc7810fc}, - {0x89b9b3e11b6329ba, 0xa9e904c87fcb0a9e}, - {0xac2820d9623bf429, 0x546345fa9fbdcd45}, - {0xd732290fbacaf133, 0xa97c177947ad4096}, - {0x867f59a9d4bed6c0, 0x49ed8eabcccc485e}, - {0xa81f301449ee8c70, 0x5c68f256bfff5a75}, - {0xd226fc195c6a2f8c, 0x73832eec6fff3112}, - {0x83585d8fd9c25db7, 0xc831fd53c5ff7eac}, - {0xa42e74f3d032f525, 0xba3e7ca8b77f5e56}, - {0xcd3a1230c43fb26f, 0x28ce1bd2e55f35ec}, - {0x80444b5e7aa7cf85, 0x7980d163cf5b81b4}, - {0xa0555e361951c366, 0xd7e105bcc3326220}, - {0xc86ab5c39fa63440, 0x8dd9472bf3fefaa8}, - {0xfa856334878fc150, 0xb14f98f6f0feb952}, - {0x9c935e00d4b9d8d2, 0x6ed1bf9a569f33d4}, - {0xc3b8358109e84f07, 0x0a862f80ec4700c9}, - {0xf4a642e14c6262c8, 0xcd27bb612758c0fb}, - {0x98e7e9cccfbd7dbd, 0x8038d51cb897789d}, - {0xbf21e44003acdd2c, 0xe0470a63e6bd56c4}, - {0xeeea5d5004981478, 0x1858ccfce06cac75}, - {0x95527a5202df0ccb, 0x0f37801e0c43ebc9}, - {0xbaa718e68396cffd, 0xd30560258f54e6bb}, - {0xe950df20247c83fd, 0x47c6b82ef32a206a}, - {0x91d28b7416cdd27e, 0x4cdc331d57fa5442}, - {0xb6472e511c81471d, 0xe0133fe4adf8e953}, - {0xe3d8f9e563a198e5, 0x58180fddd97723a7}, - {0x8e679c2f5e44ff8f, 0x570f09eaa7ea7649}, - {0xb201833b35d63f73, 0x2cd2cc6551e513db}, - {0xde81e40a034bcf4f, 0xf8077f7ea65e58d2}, - {0x8b112e86420f6191, 
0xfb04afaf27faf783}, - {0xadd57a27d29339f6, 0x79c5db9af1f9b564}, - {0xd94ad8b1c7380874, 0x18375281ae7822bd}, - {0x87cec76f1c830548, 0x8f2293910d0b15b6}, - {0xa9c2794ae3a3c69a, 0xb2eb3875504ddb23}, - {0xd433179d9c8cb841, 0x5fa60692a46151ec}, - {0x849feec281d7f328, 0xdbc7c41ba6bcd334}, - {0xa5c7ea73224deff3, 0x12b9b522906c0801}, - {0xcf39e50feae16bef, 0xd768226b34870a01}, - {0x81842f29f2cce375, 0xe6a1158300d46641}, - {0xa1e53af46f801c53, 0x60495ae3c1097fd1}, - {0xca5e89b18b602368, 0x385bb19cb14bdfc5}, - {0xfcf62c1dee382c42, 0x46729e03dd9ed7b6}, - {0x9e19db92b4e31ba9, 0x6c07a2c26a8346d2}, - {0xc5a05277621be293, 0xc7098b7305241886}, - {0xf70867153aa2db38, 0xb8cbee4fc66d1ea8}, - {0x9a65406d44a5c903, 0x737f74f1dc043329}, - {0xc0fe908895cf3b44, 0x505f522e53053ff3}, - {0xf13e34aabb430a15, 0x647726b9e7c68ff0}, - {0x96c6e0eab509e64d, 0x5eca783430dc19f6}, - {0xbc789925624c5fe0, 0xb67d16413d132073}, - {0xeb96bf6ebadf77d8, 0xe41c5bd18c57e890}, - {0x933e37a534cbaae7, 0x8e91b962f7b6f15a}, - {0xb80dc58e81fe95a1, 0x723627bbb5a4adb1}, - {0xe61136f2227e3b09, 0xcec3b1aaa30dd91d}, - {0x8fcac257558ee4e6, 0x213a4f0aa5e8a7b2}, - {0xb3bd72ed2af29e1f, 0xa988e2cd4f62d19e}, - {0xe0accfa875af45a7, 0x93eb1b80a33b8606}, - {0x8c6c01c9498d8b88, 0xbc72f130660533c4}, - {0xaf87023b9bf0ee6a, 0xeb8fad7c7f8680b5}, - {0xdb68c2ca82ed2a05, 0xa67398db9f6820e2}, -#else - {0xff77b1fcbebcdc4f, 0x25e8e89c13bb0f7b}, - {0xce5d73ff402d98e3, 0xfb0a3d212dc81290}, - {0xa6b34ad8c9dfc06f, 0xf42faa48c0ea481f}, - {0x86a8d39ef77164bc, 0xae5dff9c02033198}, - {0xd98ddaee19068c76, 0x3badd624dd9b0958}, - {0xafbd2350644eeacf, 0xe5d1929ef90898fb}, - {0x8df5efabc5979c8f, 0xca8d3ffa1ef463c2}, - {0xe55990879ddcaabd, 0xcc420a6a101d0516}, - {0xb94470938fa89bce, 0xf808e40e8d5b3e6a}, - {0x95a8637627989aad, 0xdde7001379a44aa9}, - {0xf1c90080baf72cb1, 0x5324c68b12dd6339}, - {0xc350000000000000, 0x0000000000000000}, - {0x9dc5ada82b70b59d, 0xf020000000000000}, - {0xfee50b7025c36a08, 0x02f236d04753d5b5}, - {0xcde6fd5e09abcf26, 0xed4c0226b55e6f87}, - {0xa6539930bf6bff45, 0x84db8346b786151d}, - {0x865b86925b9bc5c2, 0x0b8a2392ba45a9b3}, - {0xd910f7ff28069da4, 0x1b2ba1518094da05}, - {0xaf58416654a6babb, 0x387ac8d1970027b3}, - {0x8da471a9de737e24, 0x5ceaecfed289e5d3}, - {0xe4d5e82392a40515, 0x0fabaf3feaa5334b}, - {0xb8da1662e7b00a17, 0x3d6a751f3b936244}, - {0x95527a5202df0ccb, 0x0f37801e0c43ebc9}, - {0xf13e34aabb430a15, 0x647726b9e7c68ff0} -#endif - }; - -#if FMT_USE_FULL_CACHE_DRAGONBOX - return pow10_significands[k - float_info::min_k]; -#else - static constexpr const uint64_t powers_of_5_64[] = {0x0000000000000001, - 0x0000000000000005, 0x0000000000000019, 0x000000000000007d, - 0x0000000000000271, 0x0000000000000c35, 0x0000000000003d09, - 0x000000000001312d, 0x000000000005f5e1, 0x00000000001dcd65, - 0x00000000009502f9, 0x0000000002e90edd, 0x000000000e8d4a51, - 0x0000000048c27395, 0x000000016bcc41e9, 0x000000071afd498d, - 0x0000002386f26fc1, 0x000000b1a2bc2ec5, 0x000003782dace9d9, - 0x00001158e460913d, 0x000056bc75e2d631, 0x0001b1ae4d6e2ef5, - 0x000878678326eac9, 0x002a5a058fc295ed, 0x00d3c21bcecceda1, - 0x0422ca8b0a00a425, 0x14adf4b7320334b9}; - - static const int compression_ratio = 27; - - // Compute base index. - int cache_index = (k - float_info::min_k) / compression_ratio; - int kb = cache_index * compression_ratio + float_info::min_k; - int offset = k - kb; - - // Get base cache. - uint128_fallback base_cache = pow10_significands[cache_index]; - if (offset == 0) return base_cache; - - // Compute the required amount of bit-shift. 
-    int alpha
-            = floor_log2_pow10(kb + offset) - floor_log2_pow10(kb) - offset;
-    FMT_ASSERT(alpha > 0 && alpha < 64, "shifting error detected");
-
-    // Try to recover the real cache.
-    uint64_t pow5 = powers_of_5_64[offset];
-    uint128_fallback recovered_cache = umul128(base_cache.high(), pow5);
-    uint128_fallback middle_low = umul128(base_cache.low(), pow5);
-
-    recovered_cache += middle_low.high();
-
-    uint64_t high_to_middle = recovered_cache.high() << (64 - alpha);
-    uint64_t middle_to_low = recovered_cache.low() << (64 - alpha);
-
-    recovered_cache = uint128_fallback {
-            (recovered_cache.low() >> alpha) | high_to_middle,
-            ((middle_low.low() >> alpha) | middle_to_low)};
-    FMT_ASSERT(recovered_cache.low() + 1 != 0, "");
-    return {recovered_cache.high(), recovered_cache.low() + 1};
-#endif
-    }
-
-    struct compute_mul_result {
-        carrier_uint result;
-        bool is_integer;
-    };
-    struct compute_mul_parity_result {
-        bool parity;
-        bool is_integer;
-    };
-
-    static auto compute_mul(carrier_uint u,
-            const cache_entry_type &cache) noexcept -> compute_mul_result {
-        auto r = umul192_upper128(u, cache);
-        return {r.high(), r.low() == 0};
-    }
-
-    static auto compute_delta(cache_entry_type const &cache, int beta) noexcept
-            -> uint32_t {
-        return static_cast<uint32_t>(cache.high() >> (64 - 1 - beta));
-    }
-
-    static auto compute_mul_parity(carrier_uint two_f,
-            const cache_entry_type &cache, int beta) noexcept
-            -> compute_mul_parity_result {
-        FMT_ASSERT(beta >= 1, "");
-        FMT_ASSERT(beta < 64, "");
-
-        auto r = umul192_lower128(two_f, cache);
-        return {((r.high() >> (64 - beta)) & 1) != 0,
-                ((r.high() << beta) | (r.low() >> (64 - beta))) == 0};
-    }
-
-    static auto compute_left_endpoint_for_shorter_interval_case(
-            const cache_entry_type &cache, int beta) noexcept -> carrier_uint {
-        return (cache.high()
-                       - (cache.high() >> (num_significand_bits<double>() + 2)))
-                >> (64 - num_significand_bits<double>() - 1 - beta);
-    }
-
-    static auto compute_right_endpoint_for_shorter_interval_case(
-            const cache_entry_type &cache, int beta) noexcept -> carrier_uint {
-        return (cache.high()
-                       + (cache.high() >> (num_significand_bits<double>() + 1)))
-                >> (64 - num_significand_bits<double>() - 1 - beta);
-    }
-
-    static auto compute_round_up_for_shorter_interval_case(
-            const cache_entry_type &cache, int beta) noexcept -> carrier_uint {
-        return ((cache.high()
-                        >> (64 - num_significand_bits<double>() - 2 - beta))
-                       + 1)
-                / 2;
-    }
-};
-
-FMT_FUNC auto get_cached_power(int k) noexcept -> uint128_fallback {
-    return cache_accessor<double>::get_cached_power(k);
-}
-
-// Various integer checks
-template <typename T>
-auto is_left_endpoint_integer_shorter_interval(int exponent) noexcept -> bool {
-    const int case_shorter_interval_left_endpoint_lower_threshold = 2;
-    const int case_shorter_interval_left_endpoint_upper_threshold = 3;
-    return exponent >= case_shorter_interval_left_endpoint_lower_threshold
-            && exponent <= case_shorter_interval_left_endpoint_upper_threshold;
-}
-
-// Remove trailing zeros from n and return the number of zeros removed (float)
-FMT_INLINE int remove_trailing_zeros(uint32_t &n, int s = 0) noexcept {
-    FMT_ASSERT(n != 0, "");
-    // Modular inverse of 5 (mod 2^32): (mod_inv_5 * 5) mod 2^32 = 1.
-    constexpr uint32_t mod_inv_5 = 0xcccccccd;
-    constexpr uint32_t mod_inv_25 = 0xc28f5c29; // = mod_inv_5 * mod_inv_5
-
-    while (true) {
-        auto q = rotr(n * mod_inv_25, 2);
-        if (q > max_value<uint32_t>() / 100) break;
-        n = q;
-        s += 2;
-    }
-    auto q = rotr(n * mod_inv_5, 1);
-    if (q <= max_value<uint32_t>() / 10) {
-        n = q;
-        s |= 1;
-    }
-    return s;
-}
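// Standalone sketch of the modular-inverse test used above: n is divisible
// by 10 exactly when rotr(n * inv5, 1) <= UINT32_MAX / 10, because the
// multiply undoes the factor 5 and the rotate undoes the factor 2. The loop
// bound below is an arbitrary spot check:
#include <cassert>
#include <cstdint>
#include <limits>

static uint32_t rotr32(uint32_t n, uint32_t r) {
    r &= 31;
    return (n >> r) | (n << (32 - r));
}

int main() {
    constexpr uint32_t inv5 = 0xcccccccd; // 5 * inv5 == 1 (mod 2^32)
    for (uint32_t n = 1; n < 100000; ++n) {
        uint32_t q = rotr32(n * inv5, 1);
        bool div10 = q <= std::numeric_limits<uint32_t>::max() / 10;
        assert(div10 == (n % 10 == 0));
        if (div10) assert(q == n / 10);
    }
}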
-
-// Removes trailing zeros and returns the number of zeros removed (double)
-FMT_INLINE int remove_trailing_zeros(uint64_t &n) noexcept {
-    FMT_ASSERT(n != 0, "");
-
-    // This magic number is ceil(2^90 / 10^8).
-    constexpr uint64_t magic_number = 12379400392853802749ull;
-    auto nm = umul128(n, magic_number);
-
-    // Is n divisible by 10^8?
-    if ((nm.high() & ((1ull << (90 - 64)) - 1)) == 0
-            && nm.low() < magic_number) {
-        // If yes, work with the quotient...
-        auto n32 = static_cast<uint32_t>(nm.high() >> (90 - 64));
-        // ... and use the 32 bit variant of the function
-        int s = remove_trailing_zeros(n32, 8);
-        n = n32;
-        return s;
-    }
-
-    // If n is not divisible by 10^8, work with n itself.
-    constexpr uint64_t mod_inv_5 = 0xcccccccccccccccd;
-    constexpr uint64_t mod_inv_25 = 0x8f5c28f5c28f5c29; // mod_inv_5 * mod_inv_5
-
-    int s = 0;
-    while (true) {
-        auto q = rotr(n * mod_inv_25, 2);
-        if (q > max_value<uint64_t>() / 100) break;
-        n = q;
-        s += 2;
-    }
-    auto q = rotr(n * mod_inv_5, 1);
-    if (q <= max_value<uint64_t>() / 10) {
-        n = q;
-        s |= 1;
-    }
-
-    return s;
-}
-
-// The main algorithm for shorter interval case
-template <typename T>
-FMT_INLINE decimal_fp<T> shorter_interval_case(int exponent) noexcept {
-    decimal_fp<T> ret_value;
-    // Compute k and beta
-    const int minus_k = floor_log10_pow2_minus_log10_4_over_3(exponent);
-    const int beta = exponent + floor_log2_pow10(-minus_k);
-
-    // Compute xi and zi
-    using cache_entry_type = typename cache_accessor<T>::cache_entry_type;
-    const cache_entry_type cache
-            = cache_accessor<T>::get_cached_power(-minus_k);
-
-    auto xi = cache_accessor<
-            T>::compute_left_endpoint_for_shorter_interval_case(cache, beta);
-    auto zi = cache_accessor<
-            T>::compute_right_endpoint_for_shorter_interval_case(cache, beta);
-
-    // If the left endpoint is not an integer, increase it
-    if (!is_left_endpoint_integer_shorter_interval<T>(exponent)) ++xi;
-
-    // Try bigger divisor
-    ret_value.significand = zi / 10;
-
-    // If succeed, remove trailing zeros if necessary and return
-    if (ret_value.significand * 10 >= xi) {
-        ret_value.exponent = minus_k + 1;
-        ret_value.exponent += remove_trailing_zeros(ret_value.significand);
-        return ret_value;
-    }
-
-    // Otherwise, compute the round-up of y
-    ret_value.significand
-            = cache_accessor<T>::compute_round_up_for_shorter_interval_case(
-                    cache, beta);
-    ret_value.exponent = minus_k;
-
-    // When tie occurs, choose one of them according to the rule
-    if (exponent >= float_info<T>::shorter_interval_tie_lower_threshold
-            && exponent
-                    <= float_info<T>::shorter_interval_tie_upper_threshold) {
-        ret_value.significand = ret_value.significand % 2 == 0
-                ? ret_value.significand
-                : ret_value.significand - 1;
-    } else if (ret_value.significand < xi) {
-        ++ret_value.significand;
-    }
-    return ret_value;
-}
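// Standalone sketch of the 10^8 split used by the 64-bit variant above,
// using the unsigned __int128 extension (GCC/Clang) in place of fmt's
// umul128; the sample values are arbitrary:
#include <cassert>
#include <cstdint>

int main() {
    // ceil(2^90 / 10^8), as in remove_trailing_zeros(uint64_t &).
    constexpr uint64_t magic = 12379400392853802749ull;
    for (uint64_t n : {100000000ull, 123400000000ull, 99999999ull,
                 170000001ull, 4300000000ull}) {
        unsigned __int128 nm = static_cast<unsigned __int128>(n) * magic;
        uint64_t hi = static_cast<uint64_t>(nm >> 64);
        uint64_t lo = static_cast<uint64_t>(nm);
        bool div = (hi & ((1ull << 26) - 1)) == 0 && lo < magic;
        assert(div == (n % 100000000ull == 0));
        if (div) assert((hi >> 26) == n / 100000000ull); // quotient = nm >> 90
    }
}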
-
-template <typename T>
-auto to_decimal(T x) noexcept -> decimal_fp<T> {
-    // Step 1: integer promotion & Schubfach multiplier calculation.
-
-    using carrier_uint = typename float_info<T>::carrier_uint;
-    using cache_entry_type = typename cache_accessor<T>::cache_entry_type;
-    auto br = bit_cast<carrier_uint>(x);
-
-    // Extract significand bits and exponent bits.
-    const carrier_uint significand_mask
-            = (static_cast<carrier_uint>(1) << num_significand_bits<T>()) - 1;
-    carrier_uint significand = (br & significand_mask);
-    int exponent = static_cast<int>(
-            (br & exponent_mask<T>()) >> num_significand_bits<T>());
-
-    if (exponent != 0) { // Check if normal.
-        exponent -= exponent_bias<T>() + num_significand_bits<T>();
-
-        // Shorter interval case; proceed like Schubfach.
-        // In fact, when exponent == 1 and significand == 0, the interval is
-        // regular. However, it can be shown that the end-results are anyway same.
-        if (significand == 0) return shorter_interval_case<T>(exponent);
-
-        significand
-                |= (static_cast<carrier_uint>(1) << num_significand_bits<T>());
-    } else {
-        // Subnormal case; the interval is always regular.
-        if (significand == 0) return {0, 0};
-        exponent = std::numeric_limits<T>::min_exponent
-                - num_significand_bits<T>() - 1;
-    }
-
-    const bool include_left_endpoint = (significand % 2 == 0);
-    const bool include_right_endpoint = include_left_endpoint;
-
-    // Compute k and beta.
-    const int minus_k = floor_log10_pow2(exponent) - float_info<T>::kappa;
-    const cache_entry_type cache
-            = cache_accessor<T>::get_cached_power(-minus_k);
-    const int beta = exponent + floor_log2_pow10(-minus_k);
-
-    // Compute zi and deltai.
-    // 10^kappa <= deltai < 10^(kappa + 1)
-    const uint32_t deltai = cache_accessor<T>::compute_delta(cache, beta);
-    const carrier_uint two_fc = significand << 1;
-
-    // For the case of binary32, the result of integer check is not correct for
-    // 29711844 * 2^-82
-    // = 6.1442653300000000008655037797566933477355632930994033813476... * 10^-18
-    // and 29711844 * 2^-81
-    // = 1.2288530660000000001731007559513386695471126586198806762695... * 10^-17,
-    // and they are the unique counterexamples. However, since 29711844 is even,
-    // this does not cause any problem for the endpoints calculations; it can only
-    // cause a problem when we need to perform integer check for the center.
-    // Fortunately, with these inputs, that branch is never executed, so we are
-    // fine.
-    const typename cache_accessor<T>::compute_mul_result z_mul
-            = cache_accessor<T>::compute_mul((two_fc | 1) << beta, cache);
-
-    // Step 2: Try larger divisor; remove trailing zeros if necessary.
-
-    // Using an upper bound on zi, we might be able to optimize the division
-    // better than the compiler; we are computing zi / big_divisor here.
-    decimal_fp<T> ret_value;
-    ret_value.significand = divide_by_10_to_kappa_plus_1(z_mul.result);
-    uint32_t r = static_cast<uint32_t>(
-            z_mul.result - float_info<T>::big_divisor * ret_value.significand);
-
-    if (r < deltai) {
-        // Exclude the right endpoint if necessary.
-        if (r == 0 && (z_mul.is_integer & !include_right_endpoint)) {
-            --ret_value.significand;
-            r = float_info<T>::big_divisor;
-            goto small_divisor_case_label;
-        }
-    } else if (r > deltai) {
-        goto small_divisor_case_label;
-    } else {
-        // r == deltai; compare fractional parts.
-        const typename cache_accessor<T>::compute_mul_parity_result x_mul
-                = cache_accessor<T>::compute_mul_parity(
-                        two_fc - 1, cache, beta);
-
-        if (!(x_mul.parity | (x_mul.is_integer & include_left_endpoint)))
-            goto small_divisor_case_label;
-    }
-    ret_value.exponent = minus_k + float_info<T>::kappa + 1;
-
-    // We may need to remove trailing zeros.
-    ret_value.exponent += remove_trailing_zeros(ret_value.significand);
-    return ret_value;
-
-    // Step 3: Find the significand with the smaller divisor.
-
-small_divisor_case_label:
-    ret_value.significand *= 10;
-    ret_value.exponent = minus_k + float_info<T>::kappa;
-
-    uint32_t dist = r - (deltai / 2) + (float_info<T>::small_divisor / 2);
-    const bool approx_y_parity
-            = ((dist ^ (float_info<T>::small_divisor / 2)) & 1) != 0;
-
-    // Is dist divisible by 10^kappa?
-    const bool divisible_by_small_divisor
-            = check_divisibility_and_divide_by_pow10<float_info<T>::kappa>(
-                    dist);
-
-    // Add dist / 10^kappa to the significand.
-    ret_value.significand += dist;
-
-    if (!divisible_by_small_divisor) return ret_value;
-
-    // Check z^(f) >= epsilon^(f).
-    // We have either yi == zi - epsiloni or yi == (zi - epsiloni) - 1,
-    // where yi == zi - epsiloni if and only if z^(f) >= epsilon^(f).
-    // Since there are only 2 possibilities, we only need to care about the
-    // parity. Also, zi and r should have the same parity since the divisor
-    // is an even number.
-    const auto y_mul
-            = cache_accessor<T>::compute_mul_parity(two_fc, cache, beta);
-
-    // If z^(f) >= epsilon^(f), we might have a tie when z^(f) == epsilon^(f),
-    // or equivalently, when y is an integer.
-    if (y_mul.parity != approx_y_parity)
-        --ret_value.significand;
-    else if (y_mul.is_integer & (ret_value.significand % 2 != 0))
-        --ret_value.significand;
-    return ret_value;
-}
-} // namespace dragonbox
-} // namespace detail
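// to_decimal() above yields the shortest significand/exponent pair that
// round-trips. The same contract is observable through std::to_chars
// (C++17); a sketch, assuming a standard library whose charconv supports
// floating-point (GCC 11+, MSVC):
#include <cassert>
#include <charconv>
#include <cstdio>

int main() {
    float x = 0.3f;
    char buf[32];
    auto res = std::to_chars(buf, buf + sizeof(buf), x);
    *res.ptr = '\0';
    std::printf("%s\n", buf); // "0.3": the shortest form that round-trips
    float back = 0;
    std::from_chars(buf, res.ptr, back);
    assert(back == x);
}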
-
-template <>
-struct formatter<detail::bigint> {
-    FMT_CONSTEXPR auto parse(format_parse_context &ctx)
-            -> format_parse_context::iterator {
-        return ctx.begin();
-    }
-
-    auto format(const detail::bigint &n, format_context &ctx) const
-            -> format_context::iterator {
-        auto out = ctx.out();
-        bool first = true;
-        for (auto i = n.bigits_.size(); i > 0; --i) {
-            auto value = n.bigits_[i - 1u];
-            if (first) {
-                out = fmt::format_to(out, FMT_STRING("{:x}"), value);
-                first = false;
-                continue;
-            }
-            out = fmt::format_to(out, FMT_STRING("{:08x}"), value);
-        }
-        if (n.exp_ > 0)
-            out = fmt::format_to(out, FMT_STRING("p{}"),
-                    n.exp_ * detail::bigint::bigit_bits);
-        return out;
-    }
-};
-
-FMT_FUNC detail::utf8_to_utf16::utf8_to_utf16(string_view s) {
-    for_each_codepoint(s, [this](uint32_t cp, string_view) {
-        if (cp == invalid_code_point)
-            FMT_THROW(std::runtime_error("invalid utf8"));
-        if (cp <= 0xFFFF) {
-            buffer_.push_back(static_cast<wchar_t>(cp));
-        } else {
-            cp -= 0x10000;
-            buffer_.push_back(static_cast<wchar_t>(0xD800 + (cp >> 10)));
-            buffer_.push_back(static_cast<wchar_t>(0xDC00 + (cp & 0x3FF)));
-        }
-        return true;
-    });
-    buffer_.push_back(0);
-}
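// The surrogate-pair arithmetic above, checked standalone for one code point
// beyond the Basic Multilingual Plane:
#include <cassert>
#include <cstdint>

int main() {
    uint32_t cp = 0x1F600; // a code point above the BMP
    cp -= 0x10000;
    uint16_t high = static_cast<uint16_t>(0xD800 + (cp >> 10));
    uint16_t low = static_cast<uint16_t>(0xDC00 + (cp & 0x3FF));
    assert(high == 0xD83D && low == 0xDE00); // UTF-16 for U+1F600
    // Decoding reverses it:
    uint32_t decoded = 0x10000 + ((high - 0xD800u) << 10) + (low - 0xDC00u);
    assert(decoded == 0x1F600);
}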
-
-FMT_FUNC void format_system_error(detail::buffer<char> &out, int error_code,
-        const char *message) noexcept {
-    FMT_TRY {
-        auto ec = std::error_code(error_code, std::generic_category());
-        write(std::back_inserter(out), std::system_error(ec, message).what());
-        return;
-    }
-    FMT_CATCH(...) {}
-    format_error_code(out, error_code, message);
-}
-
-FMT_FUNC void report_system_error(
-        int error_code, const char *message) noexcept {
-    report_error(format_system_error, error_code, message);
-}
-
-FMT_FUNC auto vformat(string_view fmt, format_args args) -> std::string {
-    // Don't optimize the "{}" case to keep the binary size small and because it
-    // can be better optimized in fmt::format anyway.
-    auto buffer = memory_buffer();
-    detail::vformat_to(buffer, fmt, args);
-    return to_string(buffer);
-}
-
-namespace detail {
-#if !defined(_WIN32) || defined(FMT_WINDOWS_NO_WCHAR)
-FMT_FUNC auto write_console(int, string_view) -> bool {
-    return false;
-}
-FMT_FUNC auto write_console(std::FILE *, string_view) -> bool {
-    return false;
-}
-#else
-using dword = conditional_t<sizeof(long) == 4, unsigned long, unsigned>;
-extern "C" __declspec(dllimport) int __stdcall WriteConsoleW( //
-        void *, const void *, dword, dword *, void *);
-
-FMT_FUNC bool write_console(int fd, string_view text) {
-    auto u16 = utf8_to_utf16(text);
-    return WriteConsoleW(reinterpret_cast<void *>(_get_osfhandle(fd)),
-                   u16.c_str(), static_cast<dword>(u16.size()), nullptr,
-                   nullptr)
-            != 0;
-}
-
-FMT_FUNC auto write_console(std::FILE *f, string_view text) -> bool {
-    return write_console(_fileno(f), text);
-}
-#endif
-
-#ifdef _WIN32
-// Print assuming legacy (non-Unicode) encoding.
-FMT_FUNC void vprint_mojibake(std::FILE *f, string_view fmt, format_args args) {
-    auto buffer = memory_buffer();
-    detail::vformat_to(buffer, fmt, args);
-    fwrite_fully(buffer.data(), buffer.size(), f);
-}
-#endif
-
-FMT_FUNC void print(std::FILE *f, string_view text) {
-#ifdef _WIN32
-    int fd = _fileno(f);
-    if (_isatty(fd)) {
-        std::fflush(f);
-        if (write_console(fd, text)) return;
-    }
-#endif
-    fwrite_fully(text.data(), text.size(), f);
-}
-} // namespace detail
-
-FMT_FUNC void vprint(std::FILE *f, string_view fmt, format_args args) {
-    auto buffer = memory_buffer();
-    detail::vformat_to(buffer, fmt, args);
-    detail::print(f, {buffer.data(), buffer.size()});
-}
-
-FMT_FUNC void vprint(string_view fmt, format_args args) {
-    vprint(stdout, fmt, args);
-}
-
-namespace detail {
-
-struct singleton {
-    unsigned char upper;
-    unsigned char lower_count;
-};
-
-inline auto is_printable(uint16_t x, const singleton *singletons,
-        size_t singletons_size, const unsigned char *singleton_lowers,
-        const unsigned char *normal, size_t normal_size) -> bool {
-    auto upper = x >> 8;
-    auto lower_start = 0;
-    for (size_t i = 0; i < singletons_size; ++i) {
-        auto s = singletons[i];
-        auto lower_end = lower_start + s.lower_count;
-        if (upper < s.upper) break;
-        if (upper == s.upper) {
-            for (auto j = lower_start; j < lower_end; ++j) {
-                if (singleton_lowers[j] == (x & 0xff)) return false;
-            }
-        }
-        lower_start = lower_end;
-    }
-
-    auto xsigned = static_cast<int>(x);
-    auto current = true;
-    for (size_t i = 0; i < normal_size; ++i) {
-        auto v = static_cast<int>(normal[i]);
-        auto len = (v & 0x80) != 0 ? (v & 0x7f) << 8 | normal[++i] : v;
-        xsigned -= len;
-        if (xsigned < 0) break;
-        current = !current;
-    }
-    return current;
-}
-
-// This code is generated by support/printable.py.
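// Sketch of the run-length scheme decoded by the second loop above: the
// "normal" table stores alternating lengths of in/out runs (true first;
// lengths >= 0x80 take two bytes). The two-byte table here is hypothetical:
#include <cassert>
#include <cstddef>

static bool decode(int x, const unsigned char *normal, std::size_t n) {
    bool current = true;
    for (std::size_t i = 0; i < n; ++i) {
        int v = normal[i];
        int len = (v & 0x80) != 0 ? (v & 0x7f) << 8 | normal[++i] : v;
        x -= len;
        if (x < 0) break;
        current = !current;
    }
    return current;
}

int main() {
    // 2 offsets "in", then 3 "out", then everything "in" again.
    const unsigned char table[] = {2, 3};
    assert(decode(0, table, 2) == true);
    assert(decode(2, table, 2) == false);
    assert(decode(4, table, 2) == false);
    assert(decode(5, table, 2) == true);
}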
-FMT_FUNC auto is_printable(uint32_t cp) -> bool { - static constexpr singleton singletons0[] = { - {0x00, 1}, - {0x03, 5}, - {0x05, 6}, - {0x06, 3}, - {0x07, 6}, - {0x08, 8}, - {0x09, 17}, - {0x0a, 28}, - {0x0b, 25}, - {0x0c, 20}, - {0x0d, 16}, - {0x0e, 13}, - {0x0f, 4}, - {0x10, 3}, - {0x12, 18}, - {0x13, 9}, - {0x16, 1}, - {0x17, 5}, - {0x18, 2}, - {0x19, 3}, - {0x1a, 7}, - {0x1c, 2}, - {0x1d, 1}, - {0x1f, 22}, - {0x20, 3}, - {0x2b, 3}, - {0x2c, 2}, - {0x2d, 11}, - {0x2e, 1}, - {0x30, 3}, - {0x31, 2}, - {0x32, 1}, - {0xa7, 2}, - {0xa9, 2}, - {0xaa, 4}, - {0xab, 8}, - {0xfa, 2}, - {0xfb, 5}, - {0xfd, 4}, - {0xfe, 3}, - {0xff, 9}, - }; - static constexpr unsigned char singletons0_lower[] = { - 0xad, - 0x78, - 0x79, - 0x8b, - 0x8d, - 0xa2, - 0x30, - 0x57, - 0x58, - 0x8b, - 0x8c, - 0x90, - 0x1c, - 0x1d, - 0xdd, - 0x0e, - 0x0f, - 0x4b, - 0x4c, - 0xfb, - 0xfc, - 0x2e, - 0x2f, - 0x3f, - 0x5c, - 0x5d, - 0x5f, - 0xb5, - 0xe2, - 0x84, - 0x8d, - 0x8e, - 0x91, - 0x92, - 0xa9, - 0xb1, - 0xba, - 0xbb, - 0xc5, - 0xc6, - 0xc9, - 0xca, - 0xde, - 0xe4, - 0xe5, - 0xff, - 0x00, - 0x04, - 0x11, - 0x12, - 0x29, - 0x31, - 0x34, - 0x37, - 0x3a, - 0x3b, - 0x3d, - 0x49, - 0x4a, - 0x5d, - 0x84, - 0x8e, - 0x92, - 0xa9, - 0xb1, - 0xb4, - 0xba, - 0xbb, - 0xc6, - 0xca, - 0xce, - 0xcf, - 0xe4, - 0xe5, - 0x00, - 0x04, - 0x0d, - 0x0e, - 0x11, - 0x12, - 0x29, - 0x31, - 0x34, - 0x3a, - 0x3b, - 0x45, - 0x46, - 0x49, - 0x4a, - 0x5e, - 0x64, - 0x65, - 0x84, - 0x91, - 0x9b, - 0x9d, - 0xc9, - 0xce, - 0xcf, - 0x0d, - 0x11, - 0x29, - 0x45, - 0x49, - 0x57, - 0x64, - 0x65, - 0x8d, - 0x91, - 0xa9, - 0xb4, - 0xba, - 0xbb, - 0xc5, - 0xc9, - 0xdf, - 0xe4, - 0xe5, - 0xf0, - 0x0d, - 0x11, - 0x45, - 0x49, - 0x64, - 0x65, - 0x80, - 0x84, - 0xb2, - 0xbc, - 0xbe, - 0xbf, - 0xd5, - 0xd7, - 0xf0, - 0xf1, - 0x83, - 0x85, - 0x8b, - 0xa4, - 0xa6, - 0xbe, - 0xbf, - 0xc5, - 0xc7, - 0xce, - 0xcf, - 0xda, - 0xdb, - 0x48, - 0x98, - 0xbd, - 0xcd, - 0xc6, - 0xce, - 0xcf, - 0x49, - 0x4e, - 0x4f, - 0x57, - 0x59, - 0x5e, - 0x5f, - 0x89, - 0x8e, - 0x8f, - 0xb1, - 0xb6, - 0xb7, - 0xbf, - 0xc1, - 0xc6, - 0xc7, - 0xd7, - 0x11, - 0x16, - 0x17, - 0x5b, - 0x5c, - 0xf6, - 0xf7, - 0xfe, - 0xff, - 0x80, - 0x0d, - 0x6d, - 0x71, - 0xde, - 0xdf, - 0x0e, - 0x0f, - 0x1f, - 0x6e, - 0x6f, - 0x1c, - 0x1d, - 0x5f, - 0x7d, - 0x7e, - 0xae, - 0xaf, - 0xbb, - 0xbc, - 0xfa, - 0x16, - 0x17, - 0x1e, - 0x1f, - 0x46, - 0x47, - 0x4e, - 0x4f, - 0x58, - 0x5a, - 0x5c, - 0x5e, - 0x7e, - 0x7f, - 0xb5, - 0xc5, - 0xd4, - 0xd5, - 0xdc, - 0xf0, - 0xf1, - 0xf5, - 0x72, - 0x73, - 0x8f, - 0x74, - 0x75, - 0x96, - 0x2f, - 0x5f, - 0x26, - 0x2e, - 0x2f, - 0xa7, - 0xaf, - 0xb7, - 0xbf, - 0xc7, - 0xcf, - 0xd7, - 0xdf, - 0x9a, - 0x40, - 0x97, - 0x98, - 0x30, - 0x8f, - 0x1f, - 0xc0, - 0xc1, - 0xce, - 0xff, - 0x4e, - 0x4f, - 0x5a, - 0x5b, - 0x07, - 0x08, - 0x0f, - 0x10, - 0x27, - 0x2f, - 0xee, - 0xef, - 0x6e, - 0x6f, - 0x37, - 0x3d, - 0x3f, - 0x42, - 0x45, - 0x90, - 0x91, - 0xfe, - 0xff, - 0x53, - 0x67, - 0x75, - 0xc8, - 0xc9, - 0xd0, - 0xd1, - 0xd8, - 0xd9, - 0xe7, - 0xfe, - 0xff, - }; - static constexpr singleton singletons1[] = { - {0x00, 6}, - {0x01, 1}, - {0x03, 1}, - {0x04, 2}, - {0x08, 8}, - {0x09, 2}, - {0x0a, 5}, - {0x0b, 2}, - {0x0e, 4}, - {0x10, 1}, - {0x11, 2}, - {0x12, 5}, - {0x13, 17}, - {0x14, 1}, - {0x15, 2}, - {0x17, 2}, - {0x19, 13}, - {0x1c, 5}, - {0x1d, 8}, - {0x24, 1}, - {0x6a, 3}, - {0x6b, 2}, - {0xbc, 2}, - {0xd1, 2}, - {0xd4, 12}, - {0xd5, 9}, - {0xd6, 2}, - {0xd7, 2}, - {0xda, 1}, - {0xe0, 5}, - {0xe1, 2}, - {0xe8, 2}, - {0xee, 32}, - {0xf0, 4}, - {0xf8, 2}, - {0xf9, 2}, - {0xfa, 
2}, - {0xfb, 1}, - }; - static constexpr unsigned char singletons1_lower[] = { - 0x0c, - 0x27, - 0x3b, - 0x3e, - 0x4e, - 0x4f, - 0x8f, - 0x9e, - 0x9e, - 0x9f, - 0x06, - 0x07, - 0x09, - 0x36, - 0x3d, - 0x3e, - 0x56, - 0xf3, - 0xd0, - 0xd1, - 0x04, - 0x14, - 0x18, - 0x36, - 0x37, - 0x56, - 0x57, - 0x7f, - 0xaa, - 0xae, - 0xaf, - 0xbd, - 0x35, - 0xe0, - 0x12, - 0x87, - 0x89, - 0x8e, - 0x9e, - 0x04, - 0x0d, - 0x0e, - 0x11, - 0x12, - 0x29, - 0x31, - 0x34, - 0x3a, - 0x45, - 0x46, - 0x49, - 0x4a, - 0x4e, - 0x4f, - 0x64, - 0x65, - 0x5c, - 0xb6, - 0xb7, - 0x1b, - 0x1c, - 0x07, - 0x08, - 0x0a, - 0x0b, - 0x14, - 0x17, - 0x36, - 0x39, - 0x3a, - 0xa8, - 0xa9, - 0xd8, - 0xd9, - 0x09, - 0x37, - 0x90, - 0x91, - 0xa8, - 0x07, - 0x0a, - 0x3b, - 0x3e, - 0x66, - 0x69, - 0x8f, - 0x92, - 0x6f, - 0x5f, - 0xee, - 0xef, - 0x5a, - 0x62, - 0x9a, - 0x9b, - 0x27, - 0x28, - 0x55, - 0x9d, - 0xa0, - 0xa1, - 0xa3, - 0xa4, - 0xa7, - 0xa8, - 0xad, - 0xba, - 0xbc, - 0xc4, - 0x06, - 0x0b, - 0x0c, - 0x15, - 0x1d, - 0x3a, - 0x3f, - 0x45, - 0x51, - 0xa6, - 0xa7, - 0xcc, - 0xcd, - 0xa0, - 0x07, - 0x19, - 0x1a, - 0x22, - 0x25, - 0x3e, - 0x3f, - 0xc5, - 0xc6, - 0x04, - 0x20, - 0x23, - 0x25, - 0x26, - 0x28, - 0x33, - 0x38, - 0x3a, - 0x48, - 0x4a, - 0x4c, - 0x50, - 0x53, - 0x55, - 0x56, - 0x58, - 0x5a, - 0x5c, - 0x5e, - 0x60, - 0x63, - 0x65, - 0x66, - 0x6b, - 0x73, - 0x78, - 0x7d, - 0x7f, - 0x8a, - 0xa4, - 0xaa, - 0xaf, - 0xb0, - 0xc0, - 0xd0, - 0xae, - 0xaf, - 0x79, - 0xcc, - 0x6e, - 0x6f, - 0x93, - }; - static constexpr unsigned char normal0[] = { - 0x00, - 0x20, - 0x5f, - 0x22, - 0x82, - 0xdf, - 0x04, - 0x82, - 0x44, - 0x08, - 0x1b, - 0x04, - 0x06, - 0x11, - 0x81, - 0xac, - 0x0e, - 0x80, - 0xab, - 0x35, - 0x28, - 0x0b, - 0x80, - 0xe0, - 0x03, - 0x19, - 0x08, - 0x01, - 0x04, - 0x2f, - 0x04, - 0x34, - 0x04, - 0x07, - 0x03, - 0x01, - 0x07, - 0x06, - 0x07, - 0x11, - 0x0a, - 0x50, - 0x0f, - 0x12, - 0x07, - 0x55, - 0x07, - 0x03, - 0x04, - 0x1c, - 0x0a, - 0x09, - 0x03, - 0x08, - 0x03, - 0x07, - 0x03, - 0x02, - 0x03, - 0x03, - 0x03, - 0x0c, - 0x04, - 0x05, - 0x03, - 0x0b, - 0x06, - 0x01, - 0x0e, - 0x15, - 0x05, - 0x3a, - 0x03, - 0x11, - 0x07, - 0x06, - 0x05, - 0x10, - 0x07, - 0x57, - 0x07, - 0x02, - 0x07, - 0x15, - 0x0d, - 0x50, - 0x04, - 0x43, - 0x03, - 0x2d, - 0x03, - 0x01, - 0x04, - 0x11, - 0x06, - 0x0f, - 0x0c, - 0x3a, - 0x04, - 0x1d, - 0x25, - 0x5f, - 0x20, - 0x6d, - 0x04, - 0x6a, - 0x25, - 0x80, - 0xc8, - 0x05, - 0x82, - 0xb0, - 0x03, - 0x1a, - 0x06, - 0x82, - 0xfd, - 0x03, - 0x59, - 0x07, - 0x15, - 0x0b, - 0x17, - 0x09, - 0x14, - 0x0c, - 0x14, - 0x0c, - 0x6a, - 0x06, - 0x0a, - 0x06, - 0x1a, - 0x06, - 0x59, - 0x07, - 0x2b, - 0x05, - 0x46, - 0x0a, - 0x2c, - 0x04, - 0x0c, - 0x04, - 0x01, - 0x03, - 0x31, - 0x0b, - 0x2c, - 0x04, - 0x1a, - 0x06, - 0x0b, - 0x03, - 0x80, - 0xac, - 0x06, - 0x0a, - 0x06, - 0x21, - 0x3f, - 0x4c, - 0x04, - 0x2d, - 0x03, - 0x74, - 0x08, - 0x3c, - 0x03, - 0x0f, - 0x03, - 0x3c, - 0x07, - 0x38, - 0x08, - 0x2b, - 0x05, - 0x82, - 0xff, - 0x11, - 0x18, - 0x08, - 0x2f, - 0x11, - 0x2d, - 0x03, - 0x20, - 0x10, - 0x21, - 0x0f, - 0x80, - 0x8c, - 0x04, - 0x82, - 0x97, - 0x19, - 0x0b, - 0x15, - 0x88, - 0x94, - 0x05, - 0x2f, - 0x05, - 0x3b, - 0x07, - 0x02, - 0x0e, - 0x18, - 0x09, - 0x80, - 0xb3, - 0x2d, - 0x74, - 0x0c, - 0x80, - 0xd6, - 0x1a, - 0x0c, - 0x05, - 0x80, - 0xff, - 0x05, - 0x80, - 0xdf, - 0x0c, - 0xee, - 0x0d, - 0x03, - 0x84, - 0x8d, - 0x03, - 0x37, - 0x09, - 0x81, - 0x5c, - 0x14, - 0x80, - 0xb8, - 0x08, - 0x80, - 0xcb, - 0x2a, - 0x38, - 0x03, - 0x0a, - 0x06, - 0x38, - 0x08, - 0x46, - 0x08, - 0x0c, - 0x06, - 0x74, 
- 0x0b, - 0x1e, - 0x03, - 0x5a, - 0x04, - 0x59, - 0x09, - 0x80, - 0x83, - 0x18, - 0x1c, - 0x0a, - 0x16, - 0x09, - 0x4c, - 0x04, - 0x80, - 0x8a, - 0x06, - 0xab, - 0xa4, - 0x0c, - 0x17, - 0x04, - 0x31, - 0xa1, - 0x04, - 0x81, - 0xda, - 0x26, - 0x07, - 0x0c, - 0x05, - 0x05, - 0x80, - 0xa5, - 0x11, - 0x81, - 0x6d, - 0x10, - 0x78, - 0x28, - 0x2a, - 0x06, - 0x4c, - 0x04, - 0x80, - 0x8d, - 0x04, - 0x80, - 0xbe, - 0x03, - 0x1b, - 0x03, - 0x0f, - 0x0d, - }; - static constexpr unsigned char normal1[] = { - 0x5e, - 0x22, - 0x7b, - 0x05, - 0x03, - 0x04, - 0x2d, - 0x03, - 0x66, - 0x03, - 0x01, - 0x2f, - 0x2e, - 0x80, - 0x82, - 0x1d, - 0x03, - 0x31, - 0x0f, - 0x1c, - 0x04, - 0x24, - 0x09, - 0x1e, - 0x05, - 0x2b, - 0x05, - 0x44, - 0x04, - 0x0e, - 0x2a, - 0x80, - 0xaa, - 0x06, - 0x24, - 0x04, - 0x24, - 0x04, - 0x28, - 0x08, - 0x34, - 0x0b, - 0x01, - 0x80, - 0x90, - 0x81, - 0x37, - 0x09, - 0x16, - 0x0a, - 0x08, - 0x80, - 0x98, - 0x39, - 0x03, - 0x63, - 0x08, - 0x09, - 0x30, - 0x16, - 0x05, - 0x21, - 0x03, - 0x1b, - 0x05, - 0x01, - 0x40, - 0x38, - 0x04, - 0x4b, - 0x05, - 0x2f, - 0x04, - 0x0a, - 0x07, - 0x09, - 0x07, - 0x40, - 0x20, - 0x27, - 0x04, - 0x0c, - 0x09, - 0x36, - 0x03, - 0x3a, - 0x05, - 0x1a, - 0x07, - 0x04, - 0x0c, - 0x07, - 0x50, - 0x49, - 0x37, - 0x33, - 0x0d, - 0x33, - 0x07, - 0x2e, - 0x08, - 0x0a, - 0x81, - 0x26, - 0x52, - 0x4e, - 0x28, - 0x08, - 0x2a, - 0x56, - 0x1c, - 0x14, - 0x17, - 0x09, - 0x4e, - 0x04, - 0x1e, - 0x0f, - 0x43, - 0x0e, - 0x19, - 0x07, - 0x0a, - 0x06, - 0x48, - 0x08, - 0x27, - 0x09, - 0x75, - 0x0b, - 0x3f, - 0x41, - 0x2a, - 0x06, - 0x3b, - 0x05, - 0x0a, - 0x06, - 0x51, - 0x06, - 0x01, - 0x05, - 0x10, - 0x03, - 0x05, - 0x80, - 0x8b, - 0x62, - 0x1e, - 0x48, - 0x08, - 0x0a, - 0x80, - 0xa6, - 0x5e, - 0x22, - 0x45, - 0x0b, - 0x0a, - 0x06, - 0x0d, - 0x13, - 0x39, - 0x07, - 0x0a, - 0x36, - 0x2c, - 0x04, - 0x10, - 0x80, - 0xc0, - 0x3c, - 0x64, - 0x53, - 0x0c, - 0x48, - 0x09, - 0x0a, - 0x46, - 0x45, - 0x1b, - 0x48, - 0x08, - 0x53, - 0x1d, - 0x39, - 0x81, - 0x07, - 0x46, - 0x0a, - 0x1d, - 0x03, - 0x47, - 0x49, - 0x37, - 0x03, - 0x0e, - 0x08, - 0x0a, - 0x06, - 0x39, - 0x07, - 0x0a, - 0x81, - 0x36, - 0x19, - 0x80, - 0xb7, - 0x01, - 0x0f, - 0x32, - 0x0d, - 0x83, - 0x9b, - 0x66, - 0x75, - 0x0b, - 0x80, - 0xc4, - 0x8a, - 0xbc, - 0x84, - 0x2f, - 0x8f, - 0xd1, - 0x82, - 0x47, - 0xa1, - 0xb9, - 0x82, - 0x39, - 0x07, - 0x2a, - 0x04, - 0x02, - 0x60, - 0x26, - 0x0a, - 0x46, - 0x0a, - 0x28, - 0x05, - 0x13, - 0x82, - 0xb0, - 0x5b, - 0x65, - 0x4b, - 0x04, - 0x39, - 0x07, - 0x11, - 0x40, - 0x05, - 0x0b, - 0x02, - 0x0e, - 0x97, - 0xf8, - 0x08, - 0x84, - 0xd6, - 0x2a, - 0x09, - 0xa2, - 0xf7, - 0x81, - 0x1f, - 0x31, - 0x03, - 0x11, - 0x04, - 0x08, - 0x81, - 0x8c, - 0x89, - 0x04, - 0x6b, - 0x05, - 0x0d, - 0x03, - 0x09, - 0x07, - 0x10, - 0x93, - 0x60, - 0x80, - 0xf6, - 0x0a, - 0x73, - 0x08, - 0x6e, - 0x17, - 0x46, - 0x80, - 0x9a, - 0x14, - 0x0c, - 0x57, - 0x09, - 0x19, - 0x80, - 0x87, - 0x81, - 0x47, - 0x03, - 0x85, - 0x42, - 0x0f, - 0x15, - 0x85, - 0x50, - 0x2b, - 0x80, - 0xd5, - 0x2d, - 0x03, - 0x1a, - 0x04, - 0x02, - 0x81, - 0x70, - 0x3a, - 0x05, - 0x01, - 0x85, - 0x00, - 0x80, - 0xd7, - 0x29, - 0x4c, - 0x04, - 0x0a, - 0x04, - 0x02, - 0x83, - 0x11, - 0x44, - 0x4c, - 0x3d, - 0x80, - 0xc2, - 0x3c, - 0x06, - 0x01, - 0x04, - 0x55, - 0x05, - 0x1b, - 0x34, - 0x02, - 0x81, - 0x0e, - 0x2c, - 0x04, - 0x64, - 0x0c, - 0x56, - 0x0a, - 0x80, - 0xae, - 0x38, - 0x1d, - 0x0d, - 0x2c, - 0x04, - 0x09, - 0x07, - 0x02, - 0x0e, - 0x06, - 0x80, - 0x9a, - 0x83, - 0xd8, - 0x08, - 0x0d, - 0x03, - 0x0d, - 0x03, - 0x74, - 
0x0c, - 0x59, - 0x07, - 0x0c, - 0x14, - 0x0c, - 0x04, - 0x38, - 0x08, - 0x0a, - 0x06, - 0x28, - 0x08, - 0x22, - 0x4e, - 0x81, - 0x54, - 0x0c, - 0x15, - 0x03, - 0x03, - 0x05, - 0x07, - 0x09, - 0x19, - 0x07, - 0x07, - 0x09, - 0x03, - 0x0d, - 0x07, - 0x29, - 0x80, - 0xcb, - 0x25, - 0x0a, - 0x84, - 0x06, - }; - auto lower = static_cast(cp); - if (cp < 0x10000) { - return is_printable(lower, singletons0, - sizeof(singletons0) / sizeof(*singletons0), singletons0_lower, - normal0, sizeof(normal0)); - } - if (cp < 0x20000) { - return is_printable(lower, singletons1, - sizeof(singletons1) / sizeof(*singletons1), singletons1_lower, - normal1, sizeof(normal1)); - } - if (0x2a6de <= cp && cp < 0x2a700) return false; - if (0x2b735 <= cp && cp < 0x2b740) return false; - if (0x2b81e <= cp && cp < 0x2b820) return false; - if (0x2cea2 <= cp && cp < 0x2ceb0) return false; - if (0x2ebe1 <= cp && cp < 0x2f800) return false; - if (0x2fa1e <= cp && cp < 0x30000) return false; - if (0x3134b <= cp && cp < 0xe0100) return false; - if (0xe01f0 <= cp && cp < 0x110000) return false; - return cp < 0x110000; -} - -} // namespace detail - -FMT_END_NAMESPACE - -#endif // FMT_FORMAT_INL_H_ diff --git a/src/common/spdlog/fmt/bundled/format.h b/src/common/spdlog/fmt/bundled/format.h deleted file mode 100755 index c8e36554fc2..00000000000 --- a/src/common/spdlog/fmt/bundled/format.h +++ /dev/null @@ -1,4664 +0,0 @@ -/******************************************************************************* -* Copyright 2024 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* - Formatting library for C++ - - Copyright (c) 2012 - present, Victor Zverovich - - Permission is hereby granted, free of charge, to any person obtaining - a copy of this software and associated documentation files (the - "Software"), to deal in the Software without restriction, including - without limitation the rights to use, copy, modify, merge, publish, - distribute, sublicense, and/or sell copies of the Software, and to - permit persons to whom the Software is furnished to do so, subject to - the following conditions: - - The above copyright notice and this permission notice shall be - included in all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE - LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION - OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION - WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- - --- Optional exception to the license --- - - As an exception, if, as a result of your compiling your source code, portions - of this Software are embedded into a machine-executable object form of such - source code, you may redistribute such embedded portions in such object form - without including the above copyright and permission notices. - */ - -#ifndef FMT_FORMAT_H_ -#define FMT_FORMAT_H_ - -#include // std::signbit -#include // uint32_t -#include // std::memcpy -#include // std::numeric_limits -#include // std::uninitialized_copy -#include // std::runtime_error -#include // std::initializer_list -#include // std::system_error - -#ifdef __cpp_lib_bit_cast -#include // std::bit_cast -#endif - -#include "common/spdlog/fmt/bundled/core.h" - -#if defined __cpp_inline_variables && __cpp_inline_variables >= 201606L -#define FMT_INLINE_VARIABLE inline -#else -#define FMT_INLINE_VARIABLE -#endif - -#if FMT_HAS_CPP17_ATTRIBUTE(fallthrough) -#define FMT_FALLTHROUGH [[fallthrough]] -#elif defined(__clang__) -#define FMT_FALLTHROUGH [[clang::fallthrough]] -#elif FMT_GCC_VERSION >= 700 \ - && (!defined(__EDG_VERSION__) || __EDG_VERSION__ >= 520) -#define FMT_FALLTHROUGH [[gnu::fallthrough]] -#else -#define FMT_FALLTHROUGH -#endif - -#ifndef FMT_DEPRECATED -#if FMT_HAS_CPP14_ATTRIBUTE(deprecated) || FMT_MSC_VERSION >= 1900 -#define FMT_DEPRECATED [[deprecated]] -#else -#if (defined(__GNUC__) && !defined(__LCC__)) || defined(__clang__) -#define FMT_DEPRECATED __attribute__((deprecated)) -#elif FMT_MSC_VERSION -#define FMT_DEPRECATED __declspec(deprecated) -#else -#define FMT_DEPRECATED /* deprecated */ -#endif -#endif -#endif - -#ifndef FMT_NO_UNIQUE_ADDRESS -#if FMT_CPLUSPLUS >= 202002L -#if FMT_HAS_CPP_ATTRIBUTE(no_unique_address) -#define FMT_NO_UNIQUE_ADDRESS [[no_unique_address]] -// VS2019 v16.10 and later except clang-cl (https://reviews.llvm.org/D110485) -#elif (FMT_MSC_VERSION >= 1929) && !FMT_CLANG_VERSION -#define FMT_NO_UNIQUE_ADDRESS [[msvc::no_unique_address]] -#endif -#endif -#endif -#ifndef FMT_NO_UNIQUE_ADDRESS -#define FMT_NO_UNIQUE_ADDRESS -#endif - -// Visibility when compiled as a shared library/object. -#if defined(FMT_LIB_EXPORT) || defined(FMT_SHARED) -#define FMT_SO_VISIBILITY(value) FMT_VISIBILITY(value) -#else -#define FMT_SO_VISIBILITY(value) -#endif - -#ifdef __has_builtin -#define FMT_HAS_BUILTIN(x) __has_builtin(x) -#else -#define FMT_HAS_BUILTIN(x) 0 -#endif - -#if FMT_GCC_VERSION || FMT_CLANG_VERSION -#define FMT_NOINLINE __attribute__((noinline)) -#else -#define FMT_NOINLINE -#endif - -#ifndef FMT_THROW -#if FMT_EXCEPTIONS -#if FMT_MSC_VERSION || defined(__NVCC__) -FMT_BEGIN_NAMESPACE -namespace detail { -template -inline void do_throw(const Exception &x) { - // Silence unreachable code warnings in MSVC and NVCC because these - // are nearly impossible to fix in a generic code. 
- volatile bool b = true; - if (b) throw x; -} -} // namespace detail -FMT_END_NAMESPACE -#define FMT_THROW(x) detail::do_throw(x) -#else -#define FMT_THROW(x) throw x -#endif -#else -#define FMT_THROW(x) ::fmt::detail::assert_fail(__FILE__, __LINE__, (x).what()) -#endif -#endif - -#if FMT_EXCEPTIONS -#define FMT_TRY try -#define FMT_CATCH(x) catch (x) -#else -#define FMT_TRY if (true) -#define FMT_CATCH(x) if (false) -#endif - -#ifndef FMT_MAYBE_UNUSED -#if FMT_HAS_CPP17_ATTRIBUTE(maybe_unused) -#define FMT_MAYBE_UNUSED [[maybe_unused]] -#else -#define FMT_MAYBE_UNUSED -#endif -#endif - -#ifndef FMT_USE_USER_DEFINED_LITERALS -// EDG based compilers (Intel, NVIDIA, Elbrus, etc), GCC and MSVC support UDLs. -// -// GCC before 4.9 requires a space in `operator"" _a` which is invalid in later -// compiler versions. -#if (FMT_HAS_FEATURE(cxx_user_literals) || FMT_GCC_VERSION >= 409 \ - || FMT_MSC_VERSION >= 1900) \ - && (!defined(__EDG_VERSION__) \ - || __EDG_VERSION__ >= /* UDL feature */ 480) -#define FMT_USE_USER_DEFINED_LITERALS 1 -#else -#define FMT_USE_USER_DEFINED_LITERALS 0 -#endif -#endif - -// Defining FMT_REDUCE_INT_INSTANTIATIONS to 1, will reduce the number of -// integer formatter template instantiations to just one by only using the -// largest integer type. This results in a reduction in binary size but will -// cause a decrease in integer formatting performance. -#if !defined(FMT_REDUCE_INT_INSTANTIATIONS) -#define FMT_REDUCE_INT_INSTANTIATIONS 0 -#endif - -// __builtin_clz is broken in clang with Microsoft CodeGen: -// https://github.com/fmtlib/fmt/issues/519. -#if !FMT_MSC_VERSION -#if FMT_HAS_BUILTIN(__builtin_clz) || FMT_GCC_VERSION || FMT_ICC_VERSION -#define FMT_BUILTIN_CLZ(n) __builtin_clz(n) -#endif -#if FMT_HAS_BUILTIN(__builtin_clzll) || FMT_GCC_VERSION || FMT_ICC_VERSION -#define FMT_BUILTIN_CLZLL(n) __builtin_clzll(n) -#endif -#endif - -// __builtin_ctz is broken in Intel Compiler Classic on Windows: -// https://github.com/fmtlib/fmt/issues/2510. -#ifndef __ICL -#if FMT_HAS_BUILTIN(__builtin_ctz) || FMT_GCC_VERSION || FMT_ICC_VERSION \ - || defined(__NVCOMPILER) -#define FMT_BUILTIN_CTZ(n) __builtin_ctz(n) -#endif -#if FMT_HAS_BUILTIN(__builtin_ctzll) || FMT_GCC_VERSION || FMT_ICC_VERSION \ - || defined(__NVCOMPILER) -#define FMT_BUILTIN_CTZLL(n) __builtin_ctzll(n) -#endif -#endif - -#if FMT_MSC_VERSION -#include // _BitScanReverse[64], _BitScanForward[64], _umul128 -#endif - -// Some compilers masquerade as both MSVC and GCC-likes or otherwise support -// __builtin_clz and __builtin_clzll, so only define FMT_BUILTIN_CLZ using the -// MSVC intrinsics if the clz and clzll builtins are not available. -#if FMT_MSC_VERSION && !defined(FMT_BUILTIN_CLZLL) \ - && !defined(FMT_BUILTIN_CTZLL) -FMT_BEGIN_NAMESPACE -namespace detail { -// Avoid Clang with Microsoft CodeGen's -Wunknown-pragmas warning. -#if !defined(__clang__) -#pragma intrinsic(_BitScanForward) -#pragma intrinsic(_BitScanReverse) -#if defined(_WIN64) -#pragma intrinsic(_BitScanForward64) -#pragma intrinsic(_BitScanReverse64) -#endif -#endif - -inline auto clz(uint32_t x) -> int { - unsigned long r = 0; - _BitScanReverse(&r, x); - FMT_ASSERT(x != 0, ""); - // Static analysis complains about using uninitialized data - // "r", but the only way that can happen is if "x" is 0, - // which the callers guarantee to not happen. 
- FMT_MSC_WARNING(suppress : 6102) - return 31 ^ static_cast(r); -} -#define FMT_BUILTIN_CLZ(n) detail::clz(n) - -inline auto clzll(uint64_t x) -> int { - unsigned long r = 0; -#ifdef _WIN64 - _BitScanReverse64(&r, x); -#else - // Scan the high 32 bits. - if (_BitScanReverse(&r, static_cast(x >> 32))) - return 63 ^ static_cast(r + 32); - // Scan the low 32 bits. - _BitScanReverse(&r, static_cast(x)); -#endif - FMT_ASSERT(x != 0, ""); - FMT_MSC_WARNING( - suppress : 6102) // Suppress a bogus static analysis warning. - return 63 ^ static_cast(r); -} -#define FMT_BUILTIN_CLZLL(n) detail::clzll(n) - -inline auto ctz(uint32_t x) -> int { - unsigned long r = 0; - _BitScanForward(&r, x); - FMT_ASSERT(x != 0, ""); - FMT_MSC_WARNING( - suppress : 6102) // Suppress a bogus static analysis warning. - return static_cast(r); -} -#define FMT_BUILTIN_CTZ(n) detail::ctz(n) - -inline auto ctzll(uint64_t x) -> int { - unsigned long r = 0; - FMT_ASSERT(x != 0, ""); - FMT_MSC_WARNING( - suppress : 6102) // Suppress a bogus static analysis warning. -#ifdef _WIN64 - _BitScanForward64(&r, x); -#else - // Scan the low 32 bits. - if (_BitScanForward(&r, static_cast(x))) - return static_cast(r); - // Scan the high 32 bits. - _BitScanForward(&r, static_cast(x >> 32)); - r += 32; -#endif - return static_cast(r); -} -#define FMT_BUILTIN_CTZLL(n) detail::ctzll(n) -} // namespace detail -FMT_END_NAMESPACE -#endif - -FMT_BEGIN_NAMESPACE -namespace detail { - -FMT_CONSTEXPR inline void abort_fuzzing_if(bool condition) { - ignore_unused(condition); -#ifdef FMT_FUZZ - if (condition) throw std::runtime_error("fuzzing limit reached"); -#endif -} - -template -struct string_literal { - static constexpr CharT value[sizeof...(C)] = {C...}; - constexpr operator basic_string_view() const { - return {value, sizeof...(C)}; - } -}; - -#if FMT_CPLUSPLUS < 201703L -template -constexpr CharT string_literal::value[sizeof...(C)]; -#endif - -// Implementation of std::bit_cast for pre-C++20. -template -FMT_CONSTEXPR20 auto bit_cast(const From &from) -> To { -#ifdef __cpp_lib_bit_cast - if (is_constant_evaluated()) return std::bit_cast(from); -#endif - auto to = To(); - // The cast suppresses a bogus -Wclass-memaccess on GCC. - std::memcpy(static_cast(&to), &from, sizeof(to)); - return to; -} - -inline auto is_big_endian() -> bool { -#ifdef _WIN32 - return false; -#elif defined(__BIG_ENDIAN__) - return true; -#elif defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) - return __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__; -#else - struct bytes { - char data[sizeof(int)]; - }; - return bit_cast(1).data[0] == 0; -#endif -} - -class uint128_fallback { -private: - uint64_t lo_, hi_; - -public: - constexpr uint128_fallback(uint64_t hi, uint64_t lo) : lo_(lo), hi_(hi) {} - constexpr uint128_fallback(uint64_t value = 0) : lo_(value), hi_(0) {} - - constexpr auto high() const noexcept -> uint64_t { return hi_; } - constexpr auto low() const noexcept -> uint64_t { return lo_; } - - template ::value)> - constexpr explicit operator T() const { - return static_cast(lo_); - } - - friend constexpr auto operator==( - const uint128_fallback &lhs, const uint128_fallback &rhs) -> bool { - return lhs.hi_ == rhs.hi_ && lhs.lo_ == rhs.lo_; - } - friend constexpr auto operator!=( - const uint128_fallback &lhs, const uint128_fallback &rhs) -> bool { - return !(lhs == rhs); - } - friend constexpr auto operator>( - const uint128_fallback &lhs, const uint128_fallback &rhs) -> bool { - return lhs.hi_ != rhs.hi_ ? 
lhs.hi_ > rhs.hi_ : lhs.lo_ > rhs.lo_; - } - friend constexpr auto operator|(const uint128_fallback &lhs, - const uint128_fallback &rhs) -> uint128_fallback { - return {lhs.hi_ | rhs.hi_, lhs.lo_ | rhs.lo_}; - } - friend constexpr auto operator&(const uint128_fallback &lhs, - const uint128_fallback &rhs) -> uint128_fallback { - return {lhs.hi_ & rhs.hi_, lhs.lo_ & rhs.lo_}; - } - friend constexpr auto operator~(const uint128_fallback &n) - -> uint128_fallback { - return {~n.hi_, ~n.lo_}; - } - friend auto operator+(const uint128_fallback &lhs, - const uint128_fallback &rhs) -> uint128_fallback { - auto result = uint128_fallback(lhs); - result += rhs; - return result; - } - friend auto operator*(const uint128_fallback &lhs, uint32_t rhs) - -> uint128_fallback { - FMT_ASSERT(lhs.hi_ == 0, ""); - uint64_t hi = (lhs.lo_ >> 32) * rhs; - uint64_t lo = (lhs.lo_ & ~uint32_t()) * rhs; - uint64_t new_lo = (hi << 32) + lo; - return {(hi >> 32) + (new_lo < lo ? 1 : 0), new_lo}; - } - friend auto operator-(const uint128_fallback &lhs, uint64_t rhs) - -> uint128_fallback { - return {lhs.hi_ - (lhs.lo_ < rhs ? 1 : 0), lhs.lo_ - rhs}; - } - FMT_CONSTEXPR auto operator>>(int shift) const -> uint128_fallback { - if (shift == 64) return {0, hi_}; - if (shift > 64) return uint128_fallback(0, hi_) >> (shift - 64); - return {hi_ >> shift, (hi_ << (64 - shift)) | (lo_ >> shift)}; - } - FMT_CONSTEXPR auto operator<<(int shift) const -> uint128_fallback { - if (shift == 64) return {lo_, 0}; - if (shift > 64) return uint128_fallback(lo_, 0) << (shift - 64); - return {hi_ << shift | (lo_ >> (64 - shift)), (lo_ << shift)}; - } - FMT_CONSTEXPR auto operator>>=(int shift) -> uint128_fallback & { - return *this = *this >> shift; - } - FMT_CONSTEXPR void operator+=(uint128_fallback n) { - uint64_t new_lo = lo_ + n.lo_; - uint64_t new_hi = hi_ + n.hi_ + (new_lo < lo_ ? 1 : 0); - FMT_ASSERT(new_hi >= hi_, ""); - lo_ = new_lo; - hi_ = new_hi; - } - FMT_CONSTEXPR void operator&=(uint128_fallback n) { - lo_ &= n.lo_; - hi_ &= n.hi_; - } - - FMT_CONSTEXPR20 auto operator+=(uint64_t n) noexcept -> uint128_fallback & { - if (is_constant_evaluated()) { - lo_ += n; - hi_ += (lo_ < n ? 1 : 0); - return *this; - } -#if FMT_HAS_BUILTIN(__builtin_addcll) && !defined(__ibmxl__) - unsigned long long carry; - lo_ = __builtin_addcll(lo_, n, 0, &carry); - hi_ += carry; -#elif FMT_HAS_BUILTIN(__builtin_ia32_addcarryx_u64) && !defined(__ibmxl__) - unsigned long long result; - auto carry = __builtin_ia32_addcarryx_u64(0, lo_, n, &result); - lo_ = result; - hi_ += carry; -#elif defined(_MSC_VER) && defined(_M_X64) - auto carry = _addcarry_u64(0, lo_, n, &lo_); - _addcarry_u64(carry, hi_, 0, &hi_); -#else - lo_ += n; - hi_ += (lo_ < n ? 1 : 0); -#endif - return *this; - } -}; - -using uint128_t = conditional_t; - -#ifdef UINTPTR_MAX -using uintptr_t = ::uintptr_t; -#else -using uintptr_t = uint128_t; -#endif - -// Returns the largest possible value for type T. Same as -// std::numeric_limits::max() but shorter and not affected by the max macro. -template -constexpr auto max_value() -> T { - return (std::numeric_limits::max)(); -} -template -constexpr auto num_bits() -> int { - return std::numeric_limits::digits; -} -// std::numeric_limits::digits may return 0 for 128-bit ints. -template <> -constexpr auto num_bits() -> int { - return 128; -} -template <> -constexpr auto num_bits() -> int { - return 128; -} - -// A heterogeneous bit_cast used for converting 96-bit long double to uint128_t -// and 128-bit pointers to uint128_fallback. 
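Concretely, that conversion copies the source object into an array of 32-bit words and folds the words together with shift-or, most significant word first. A standalone little-endian sketch with assumed names (the template below adds fmt's own helpers plus a big-endian branch):

    #include <cstdint>
    #include <cstring>

    // Rebuild a wide unsigned integer from the 32-bit words of a trivially
    // copyable source. Little-endian hosts store the least significant word
    // first, so fold the words in reverse order.
    template <typename To, typename From>
    To words_to_uint(const From &from) {
        static_assert(sizeof(From) % sizeof(uint32_t) == 0, "word multiple");
        static_assert(sizeof(To) >= sizeof(From) && sizeof(To) > 4,
                "need a wide destination");
        constexpr int n = sizeof(From) / sizeof(uint32_t);
        uint32_t words[n];
        std::memcpy(words, &from, sizeof(from));
        To result = 0;
        for (int i = n - 1; i >= 0; --i)
            result = (result << 32) | words[i];
        return result;
    }

For example, `words_to_uint<uint64_t>(1.0)` recovers the IEEE-754 bit pattern 0x3ff0000000000000 on a little-endian host.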
-template sizeof(From))> -inline auto bit_cast(const From &from) -> To { - constexpr auto size = static_cast(sizeof(From) / sizeof(unsigned)); - struct data_t { - unsigned value[static_cast(size)]; - } data = bit_cast(from); - auto result = To(); - if (const_check(is_big_endian())) { - for (int i = 0; i < size; ++i) - result = (result << num_bits()) | data.value[i]; - } else { - for (int i = size - 1; i >= 0; --i) - result = (result << num_bits()) | data.value[i]; - } - return result; -} - -template -FMT_CONSTEXPR20 inline auto countl_zero_fallback(UInt n) -> int { - int lz = 0; - constexpr UInt msb_mask = static_cast(1) << (num_bits() - 1); - for (; (n & msb_mask) == 0; n <<= 1) - lz++; - return lz; -} - -FMT_CONSTEXPR20 inline auto countl_zero(uint32_t n) -> int { -#ifdef FMT_BUILTIN_CLZ - if (!is_constant_evaluated()) return FMT_BUILTIN_CLZ(n); -#endif - return countl_zero_fallback(n); -} - -FMT_CONSTEXPR20 inline auto countl_zero(uint64_t n) -> int { -#ifdef FMT_BUILTIN_CLZLL - if (!is_constant_evaluated()) return FMT_BUILTIN_CLZLL(n); -#endif - return countl_zero_fallback(n); -} - -FMT_INLINE void assume(bool condition) { - (void)condition; -#if FMT_HAS_BUILTIN(__builtin_assume) && !FMT_ICC_VERSION - __builtin_assume(condition); -#elif FMT_GCC_VERSION - if (!condition) __builtin_unreachable(); -#endif -} - -// An approximation of iterator_t for pre-C++20 systems. -template -using iterator_t = decltype(std::begin(std::declval())); -template -using sentinel_t = decltype(std::end(std::declval())); - -// A workaround for std::string not having mutable data() until C++17. -template -inline auto get_data(std::basic_string &s) -> Char * { - return &s[0]; -} -template -inline auto get_data(Container &c) -> typename Container::value_type * { - return c.data(); -} - -// Attempts to reserve space for n extra characters in the output range. -// Returns a pointer to the reserved range or a reference to it. -template ::value)> -#if FMT_CLANG_VERSION >= 307 && !FMT_ICC_VERSION -__attribute__((no_sanitize("undefined"))) -#endif -inline auto -reserve(std::back_insert_iterator it, size_t n) -> - typename Container::value_type * { - Container &c = get_container(it); - size_t size = c.size(); - c.resize(size + n); - return get_data(c) + size; -} - -template -inline auto reserve(buffer_appender it, size_t n) -> buffer_appender { - buffer &buf = get_container(it); - buf.try_reserve(buf.size() + n); - return it; -} - -template -constexpr auto reserve(Iterator &it, size_t) -> Iterator & { - return it; -} - -template -using reserve_iterator - = remove_reference_t(), 0))>; - -template -constexpr auto to_pointer(OutputIt, size_t) -> T * { - return nullptr; -} -template -auto to_pointer(buffer_appender it, size_t n) -> T * { - buffer &buf = get_container(it); - auto size = buf.size(); - if (buf.capacity() < size + n) return nullptr; - buf.try_resize(size + n); - return buf.data() + size; -} - -template ::value)> -inline auto base_iterator(std::back_insert_iterator it, - typename Container::value_type *) - -> std::back_insert_iterator { - return it; -} - -template -constexpr auto base_iterator(Iterator, Iterator it) -> Iterator { - return it; -} - -// is spectacularly slow to compile in C++20 so use a simple fill_n -// instead (#1998). 
-template -FMT_CONSTEXPR auto fill_n(OutputIt out, Size count, const T &value) - -> OutputIt { - for (Size i = 0; i < count; ++i) - *out++ = value; - return out; -} -template -FMT_CONSTEXPR20 auto fill_n(T *out, Size count, char value) -> T * { - if (is_constant_evaluated()) { - return fill_n(out, count, value); - } - std::memset(out, value, to_unsigned(count)); - return out + count; -} - -#ifdef __cpp_char8_t -using char8_type = char8_t; -#else -enum char8_type : unsigned char {}; -#endif - -template -FMT_CONSTEXPR FMT_NOINLINE auto copy_str_noinline( - InputIt begin, InputIt end, OutputIt out) -> OutputIt { - return copy_str(begin, end, out); -} - -// A public domain branchless UTF-8 decoder by Christopher Wellons: -// https://github.com/skeeto/branchless-utf8 -/* Decode the next character, c, from s, reporting errors in e. - * - * Since this is a branchless decoder, four bytes will be read from the - * buffer regardless of the actual length of the next character. This - * means the buffer _must_ have at least three bytes of zero padding - * following the end of the data stream. - * - * Errors are reported in e, which will be non-zero if the parsed - * character was somehow invalid: invalid byte sequence, non-canonical - * encoding, or a surrogate half. - * - * The function returns a pointer to the next character. When an error - * occurs, this pointer will be a guess that depends on the particular - * error, but it will always advance at least one byte. - */ -FMT_CONSTEXPR inline auto utf8_decode(const char *s, uint32_t *c, int *e) - -> const char * { - constexpr const int masks[] = {0x00, 0x7f, 0x1f, 0x0f, 0x07}; - constexpr const uint32_t mins[] = {4194304, 0, 128, 2048, 65536}; - constexpr const int shiftc[] = {0, 18, 12, 6, 0}; - constexpr const int shifte[] = {0, 6, 4, 2, 0}; - - int len = "\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\0\0\0\0\0\0\0\0\2\2\2\2\3\3\4" - [static_cast(*s) >> 3]; - // Compute the pointer to the next character early so that the next - // iteration can start working on the next character. Neither Clang - // nor GCC figure out this reordering on their own. - const char *next = s + len + !len; - - using uchar = unsigned char; - - // Assume a four-byte character and load four bytes. Unused bits are - // shifted out. - *c = uint32_t(uchar(s[0]) & masks[len]) << 18; - *c |= uint32_t(uchar(s[1]) & 0x3f) << 12; - *c |= uint32_t(uchar(s[2]) & 0x3f) << 6; - *c |= uint32_t(uchar(s[3]) & 0x3f) << 0; - *c >>= shiftc[len]; - - // Accumulate the various error conditions. - *e = (*c < mins[len]) << 6; // non-canonical encoding - *e |= ((*c >> 11) == 0x1b) << 7; // surrogate half? - *e |= (*c > 0x10FFFF) << 8; // out of range? - *e |= (uchar(s[1]) & 0xc0) >> 2; - *e |= (uchar(s[2]) & 0xc0) >> 4; - *e |= uchar(s[3]) >> 6; - *e ^= 0x2a; // top two bits of each tail byte correct? - *e >>= shifte[len]; - - return next; -} - -constexpr FMT_INLINE_VARIABLE uint32_t invalid_code_point = ~uint32_t(); - -// Invokes f(cp, sv) for every code point cp in s with sv being the string view -// corresponding to the code point. cp is invalid_code_point on error. -template -FMT_CONSTEXPR void for_each_codepoint(string_view s, F f) { - auto decode = [f](const char *buf_ptr, const char *ptr) { - auto cp = uint32_t(); - auto error = 0; - auto end = utf8_decode(buf_ptr, &cp, &error); - bool result = f(error ? invalid_code_point : cp, - string_view(ptr, error ? 1 : to_unsigned(end - buf_ptr))); - return result ? (error ? 
buf_ptr + 1 : end) : nullptr; - }; - auto p = s.data(); - const size_t block_size = 4; // utf8_decode always reads blocks of 4 chars. - if (s.size() >= block_size) { - for (auto end = p + s.size() - block_size + 1; p < end;) { - p = decode(p, p); - if (!p) return; - } - } - if (auto num_chars_left = s.data() + s.size() - p) { - char buf[2 * block_size - 1] = {}; - copy_str(p, p + num_chars_left, buf); - const char *buf_ptr = buf; - do { - auto end = decode(buf_ptr, p); - if (!end) return; - p += end - buf_ptr; - buf_ptr = end; - } while (buf_ptr - buf < num_chars_left); - } -} - -template -inline auto compute_width(basic_string_view s) -> size_t { - return s.size(); -} - -// Computes approximate display width of a UTF-8 string. -FMT_CONSTEXPR inline auto compute_width(string_view s) -> size_t { - size_t num_code_points = 0; - // It is not a lambda for compatibility with C++14. - struct count_code_points { - size_t *count; - FMT_CONSTEXPR auto operator()(uint32_t cp, string_view) const -> bool { - *count += detail::to_unsigned(1 - + (cp >= 0x1100 - && (cp <= 0x115f || // Hangul Jamo init. consonants - cp == 0x2329 - || // LEFT-POINTING ANGLE BRACKET - cp == 0x232a - || // RIGHT-POINTING ANGLE BRACKET - // CJK ... Yi except IDEOGRAPHIC HALF FILL SPACE: - (cp >= 0x2e80 && cp <= 0xa4cf - && cp != 0x303f) - || (cp >= 0xac00 && cp <= 0xd7a3) - || // Hangul Syllables - (cp >= 0xf900 && cp <= 0xfaff) - || // CJK Compatibility Ideographs - (cp >= 0xfe10 && cp <= 0xfe19) - || // Vertical Forms - (cp >= 0xfe30 && cp <= 0xfe6f) - || // CJK Compatibility Forms - (cp >= 0xff00 && cp <= 0xff60) - || // Fullwidth Forms - (cp >= 0xffe0 && cp <= 0xffe6) - || // Fullwidth Forms - (cp >= 0x20000 && cp <= 0x2fffd) || // CJK - (cp >= 0x30000 && cp <= 0x3fffd) || - // Miscellaneous Symbols and Pictographs + Emoticons: - (cp >= 0x1f300 && cp <= 0x1f64f) || - // Supplemental Symbols and Pictographs: - (cp >= 0x1f900 && cp <= 0x1f9ff)))); - return true; - } - }; - // We could avoid branches by using utf8_decode directly. - for_each_codepoint(s, count_code_points {&num_code_points}); - return num_code_points; -} - -inline auto compute_width(basic_string_view s) -> size_t { - return compute_width( - string_view(reinterpret_cast(s.data()), s.size())); -} - -template -inline auto code_point_index(basic_string_view s, size_t n) -> size_t { - size_t size = s.size(); - return n < size ? n : size; -} - -// Calculates the index of the nth code point in a UTF-8 string. 
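UTF-8 makes that calculation cheap even without full decoding: continuation bytes always match the bit pattern 10xxxxxx, so every byte that does not match starts a new code point. A self-contained sketch with an assumed name:

    #include <cstddef>
    #include <string>

    // Byte index of the nth (0-based) code point; clamps to s.size() when
    // the string holds fewer than n + 1 code points.
    size_t nth_codepoint_index(const std::string &s, size_t n) {
        size_t count = 0;
        for (size_t i = 0; i < s.size(); ++i) {
            if ((static_cast<unsigned char>(s[i]) & 0xC0) != 0x80) {
                if (count == n) return i;  // s[i] starts code point #count
                ++count;
            }
        }
        return s.size();
    }

fmt's version below reuses `for_each_codepoint` instead, which also validates the encoding as it walks.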
-inline auto code_point_index(string_view s, size_t n) -> size_t { - size_t result = s.size(); - const char *begin = s.begin(); - for_each_codepoint(s, [begin, &n, &result](uint32_t, string_view sv) { - if (n != 0) { - --n; - return true; - } - result = to_unsigned(sv.begin() - begin); - return false; - }); - return result; -} - -inline auto code_point_index(basic_string_view s, size_t n) - -> size_t { - return code_point_index( - string_view(reinterpret_cast(s.data()), s.size()), n); -} - -template -struct is_integral : std::is_integral {}; -template <> -struct is_integral : std::true_type {}; -template <> -struct is_integral : std::true_type {}; - -template -using is_signed = std::integral_constant::is_signed - || std::is_same::value>; - -template -using is_integer = bool_constant::value - && !std::is_same::value && !std::is_same::value - && !std::is_same::value>; - -#ifndef FMT_USE_FLOAT -#define FMT_USE_FLOAT 1 -#endif -#ifndef FMT_USE_DOUBLE -#define FMT_USE_DOUBLE 1 -#endif -#ifndef FMT_USE_LONG_DOUBLE -#define FMT_USE_LONG_DOUBLE 1 -#endif - -#ifndef FMT_USE_FLOAT128 -#ifdef __clang__ -// Clang emulates GCC, so it has to appear early. -#if FMT_HAS_INCLUDE() -#define FMT_USE_FLOAT128 1 -#endif -#elif defined(__GNUC__) -// GNU C++: -#if defined(_GLIBCXX_USE_FLOAT128) && !defined(__STRICT_ANSI__) -#define FMT_USE_FLOAT128 1 -#endif -#endif -#ifndef FMT_USE_FLOAT128 -#define FMT_USE_FLOAT128 0 -#endif -#endif - -#if FMT_USE_FLOAT128 -using float128 = __float128; -#else -using float128 = void; -#endif -template -using is_float128 = std::is_same; - -template -using is_floating_point = bool_constant::value - || is_float128::value>; - -template ::value> -struct is_fast_float : bool_constant::is_iec559 - && sizeof(T) <= sizeof(double)> {}; -template -struct is_fast_float : std::false_type {}; - -template -using is_double_double = bool_constant::digits == 106>; - -#ifndef FMT_USE_FULL_CACHE_DRAGONBOX -#define FMT_USE_FULL_CACHE_DRAGONBOX 0 -#endif - -template -template -void buffer::append(const U *begin, const U *end) { - while (begin != end) { - auto count = to_unsigned(end - begin); - try_reserve(size_ + count); - auto free_cap = capacity_ - size_; - if (free_cap < count) count = free_cap; - std::uninitialized_copy_n(begin, count, ptr_ + size_); - size_ += count; - begin += count; - } -} - -template -struct is_locale : std::false_type {}; -template -struct is_locale> : std::true_type {}; -} // namespace detail - -FMT_BEGIN_EXPORT - -// The number of characters to store in the basic_memory_buffer object itself -// to avoid dynamic memory allocation. -enum { inline_buffer_size = 500 }; - -/** - \rst - A dynamically growing memory buffer for trivially copyable/constructible types - with the first ``SIZE`` elements stored in the object itself. - - You can use the ``memory_buffer`` type alias for ``char`` instead. - - **Example**:: - - auto out = fmt::memory_buffer(); - fmt::format_to(std::back_inserter(out), "The answer is {}.", 42); - - This will append the following output to the ``out`` object: - - .. code-block:: none - - The answer is 42. - - The output can be converted to an ``std::string`` with ``to_string(out)``. - \endrst - */ -template > -class basic_memory_buffer final : public detail::buffer { -private: - T store_[SIZE]; - - // Don't inherit from Allocator to avoid generating type_info for it. - FMT_NO_UNIQUE_ADDRESS Allocator alloc_; - - // Deallocate memory allocated by the buffer. 
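The next few members implement the ownership rule behind that comment: storage starts at the inline array and is released only after `grow()` has moved it to the heap, which a single pointer comparison decides. A reduced sketch of the same rule, with assumed names (the real `grow()` below adds a 1.5x growth policy, allocator support, and fuzzing limits):

    #include <cstddef>

    template <typename T, size_t N>
    class sbo_buffer {
        T store_[N];  // inline storage: no allocation while small
        T *data_ = store_;
        size_t size_ = 0, capacity_ = N;

    public:
        ~sbo_buffer() {
            if (data_ != store_) delete[] data_;  // heap only after growth
        }
        void push_back(const T &v) {
            if (size_ == capacity_) grow(2 * capacity_);
            data_[size_++] = v;
        }

    private:
        void grow(size_t new_cap) {
            T *p = new T[new_cap];
            for (size_t i = 0; i < size_; ++i) p[i] = data_[i];
            if (data_ != store_) delete[] data_;
            data_ = p;
            capacity_ = new_cap;
        }
    };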
- FMT_CONSTEXPR20 void deallocate() { - T *data = this->data(); - if (data != store_) alloc_.deallocate(data, this->capacity()); - } - -protected: - FMT_CONSTEXPR20 void grow(size_t size) override { - detail::abort_fuzzing_if(size > 5000); - const size_t max_size - = std::allocator_traits::max_size(alloc_); - size_t old_capacity = this->capacity(); - size_t new_capacity = old_capacity + old_capacity / 2; - if (size > new_capacity) - new_capacity = size; - else if (new_capacity > max_size) - new_capacity = size > max_size ? size : max_size; - T *old_data = this->data(); - T *new_data = std::allocator_traits::allocate( - alloc_, new_capacity); - // Suppress a bogus -Wstringop-overflow in gcc 13.1 (#3481). - detail::assume(this->size() <= new_capacity); - // The following code doesn't throw, so the raw pointer above doesn't leak. - std::uninitialized_copy_n(old_data, this->size(), new_data); - this->set(new_data, new_capacity); - // deallocate must not throw according to the standard, but even if it does, - // the buffer already uses the new storage and will deallocate it in - // destructor. - if (old_data != store_) alloc_.deallocate(old_data, old_capacity); - } - -public: - using value_type = T; - using const_reference = const T &; - - FMT_CONSTEXPR20 explicit basic_memory_buffer( - const Allocator &alloc = Allocator()) - : alloc_(alloc) { - this->set(store_, SIZE); - if (detail::is_constant_evaluated()) detail::fill_n(store_, SIZE, T()); - } - FMT_CONSTEXPR20 ~basic_memory_buffer() { deallocate(); } - -private: - // Move data from other to this buffer. - FMT_CONSTEXPR20 void move(basic_memory_buffer &other) { - alloc_ = std::move(other.alloc_); - T *data = other.data(); - size_t size = other.size(), capacity = other.capacity(); - if (data == other.store_) { - this->set(store_, capacity); - detail::copy_str(other.store_, other.store_ + size, store_); - } else { - this->set(data, capacity); - // Set pointer to the inline array so that delete is not called - // when deallocating. - other.set(other.store_, 0); - other.clear(); - } - this->resize(size); - } - -public: - /** - \rst - Constructs a :class:`fmt::basic_memory_buffer` object moving the content - of the other object to it. - \endrst - */ - FMT_CONSTEXPR20 basic_memory_buffer(basic_memory_buffer &&other) noexcept { - move(other); - } - - /** - \rst - Moves the content of the other ``basic_memory_buffer`` object to this one. - \endrst - */ - auto operator=(basic_memory_buffer &&other) noexcept - -> basic_memory_buffer & { - FMT_ASSERT(this != &other, ""); - deallocate(); - move(other); - return *this; - } - - // Returns a copy of the allocator associated with this buffer. - auto get_allocator() const -> Allocator { return alloc_; } - - /** - Resizes the buffer to contain *count* elements. If T is a POD type new - elements may not be initialized. - */ - FMT_CONSTEXPR20 void resize(size_t count) { this->try_resize(count); } - - /** Increases the buffer capacity to *new_capacity*. 
*/ - void reserve(size_t new_capacity) { this->try_reserve(new_capacity); } - - using detail::buffer::append; - template - void append(const ContiguousRange &range) { - append(range.data(), range.data() + range.size()); - } -}; - -using memory_buffer = basic_memory_buffer; - -template -struct is_contiguous> : std::true_type { -}; - -FMT_END_EXPORT -namespace detail { -FMT_API auto write_console(int fd, string_view text) -> bool; -FMT_API auto write_console(std::FILE *f, string_view text) -> bool; -FMT_API void print(std::FILE *, string_view); -} // namespace detail - -FMT_BEGIN_EXPORT - -// Suppress a misleading warning in older versions of clang. -#if FMT_CLANG_VERSION -#pragma clang diagnostic ignored "-Wweak-vtables" -#endif - -/** An error reported from a formatting function. */ -class FMT_SO_VISIBILITY("default") format_error : public std::runtime_error { -public: - using std::runtime_error::runtime_error; -}; - -namespace detail_exported { -#if FMT_USE_NONTYPE_TEMPLATE_ARGS -template -struct fixed_string { - constexpr fixed_string(const Char (&str)[N]) { - detail::copy_str( - static_cast(str), str + N, data); - } - Char data[N] = {}; -}; -#endif - -// Converts a compile-time string to basic_string_view. -template -constexpr auto compile_string_to_view(const Char (&s)[N]) - -> basic_string_view { - // Remove trailing NUL character if needed. Won't be present if this is used - // with a raw character array (i.e. not defined as a string). - return {s, - N - (std::char_traits::to_int_type(s[N - 1]) == 0 ? 1 : 0)}; -} -template -constexpr auto compile_string_to_view(detail::std_string_view s) - -> basic_string_view { - return {s.data(), s.size()}; -} -} // namespace detail_exported - -class loc_value { -private: - basic_format_arg value_; - -public: - template ::value)> - loc_value(T value) : value_(detail::make_arg(value)) {} - - template ::value)> - loc_value(T) {} - - template - auto visit(Visitor &&vis) -> decltype(vis(0)) { - return visit_format_arg(vis, value_); - } -}; - -// A locale facet that formats values in UTF-8. -// It is parameterized on the locale to avoid the heavy include. -template -class format_facet : public Locale::facet { -private: - std::string separator_; - std::string grouping_; - std::string decimal_point_; - -protected: - virtual auto do_put(appender out, loc_value val, - const format_specs<> &specs) const -> bool; - -public: - static FMT_API typename Locale::id id; - - explicit format_facet(Locale &loc); - explicit format_facet(string_view sep = "", - std::initializer_list g = {3}, - std::string decimal_point = ".") - : separator_(sep.data(), sep.size()) - , grouping_(g.begin(), g.end()) - , decimal_point_(decimal_point) {} - - auto put(appender out, loc_value val, const format_specs<> &specs) const - -> bool { - return do_put(out, val, specs); - } -}; - -namespace detail { - -// Returns true if value is negative, false otherwise. -// Same as `value < 0` but doesn't produce warnings if T is an unsigned type. -template ::value)> -constexpr auto is_negative(T value) -> bool { - return value < 0; -} -template ::value)> -constexpr auto is_negative(T) -> bool { - return false; -} - -template -FMT_CONSTEXPR auto is_supported_floating_point(T) -> bool { - if (std::is_same()) return FMT_USE_FLOAT; - if (std::is_same()) return FMT_USE_DOUBLE; - if (std::is_same()) return FMT_USE_LONG_DOUBLE; - return true; -} - -// Smallest of uint32_t, uint64_t, uint128_t that is large enough to -// represent all values of an integral type T. 
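The same selection can be expressed with the standard library; a reduced two-way sketch with an assumed name (the alias below adds the 128-bit tier and the FMT_REDUCE_INT_INSTANTIATIONS knob):

    #include <cstdint>
    #include <limits>
    #include <type_traits>

    // Narrowest unsigned carrier wide enough for all values of T, so one
    // formatting-routine instantiation can serve several integer types.
    template <typename T>
    using uint32_or_64_t =
        std::conditional_t<std::numeric_limits<T>::digits <= 32,
                           uint32_t, uint64_t>;

    static_assert(std::is_same<uint32_or_64_t<short>, uint32_t>::value, "");
    static_assert(std::is_same<uint32_or_64_t<long long>, uint64_t>::value, "");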
-template -using uint32_or_64_or_128_t = conditional_t() <= 32 - && !FMT_REDUCE_INT_INSTANTIATIONS, - uint32_t, conditional_t() <= 64, uint64_t, uint128_t>>; -template -using uint64_or_128_t = conditional_t() <= 64, uint64_t, uint128_t>; - -#define FMT_POWERS_OF_10(factor) \ - factor * 10, (factor)*100, (factor)*1000, (factor)*10000, (factor)*100000, \ - (factor)*1000000, (factor)*10000000, (factor)*100000000, \ - (factor)*1000000000 - -// Converts value in the range [0, 100) to a string. -constexpr auto digits2(size_t value) -> const char * { - // GCC generates slightly better code when value is pointer-size. - return &"0001020304050607080910111213141516171819" - "2021222324252627282930313233343536373839" - "4041424344454647484950515253545556575859" - "6061626364656667686970717273747576777879" - "8081828384858687888990919293949596979899"[value * 2]; -} - -// Sign is a template parameter to workaround a bug in gcc 4.8. -template -constexpr auto sign(Sign s) -> Char { -#if !FMT_GCC_VERSION || FMT_GCC_VERSION >= 604 - static_assert(std::is_same::value, ""); -#endif - return static_cast("\0-+ "[s]); -} - -template -FMT_CONSTEXPR auto count_digits_fallback(T n) -> int { - int count = 1; - for (;;) { - // Integer division is slow so do it for a group of four digits instead - // of for every digit. The idea comes from the talk by Alexandrescu - // "Three Optimization Tips for C++". See speed-test for a comparison. - if (n < 10) return count; - if (n < 100) return count + 1; - if (n < 1000) return count + 2; - if (n < 10000) return count + 3; - n /= 10000u; - count += 4; - } -} -#if FMT_USE_INT128 -FMT_CONSTEXPR inline auto count_digits(uint128_opt n) -> int { - return count_digits_fallback(n); -} -#endif - -#ifdef FMT_BUILTIN_CLZLL -// It is a separate function rather than a part of count_digits to workaround -// the lack of static constexpr in constexpr functions. -inline auto do_count_digits(uint64_t n) -> int { - // This has comparable performance to the version by Kendall Willets - // (https://github.com/fmtlib/format-benchmark/blob/master/digits10) - // but uses smaller tables. - // Maps bsr(n) to ceil(log10(pow(2, bsr(n) + 1) - 1)). - static constexpr uint8_t bsr2log10[] = {1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, - 4, 5, 5, 5, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8, 9, 9, 9, 10, 10, 10, 10, - 11, 11, 11, 12, 12, 12, 13, 13, 13, 13, 14, 14, 14, 15, 15, 15, 16, - 16, 16, 16, 17, 17, 17, 18, 18, 18, 19, 19, 19, 19, 20}; - auto t = bsr2log10[FMT_BUILTIN_CLZLL(n | 1) ^ 63]; - static constexpr const uint64_t zero_or_powers_of_10[] - = {0, 0, FMT_POWERS_OF_10(1U), FMT_POWERS_OF_10(1000000000ULL), - 10000000000000000000ULL}; - return t - (n < zero_or_powers_of_10[t]); -} -#endif - -// Returns the number of decimal digits in n. Leading zeros are not counted -// except for n == 0 in which case count_digits returns 1. -FMT_CONSTEXPR20 inline auto count_digits(uint64_t n) -> int { -#ifdef FMT_BUILTIN_CLZLL - if (!is_constant_evaluated()) { return do_count_digits(n); } -#endif - return count_digits_fallback(n); -} - -// Counts the number of digits in n. BITS = log2(radix). -template -FMT_CONSTEXPR auto count_digits(UInt n) -> int { -#ifdef FMT_BUILTIN_CLZ - if (!is_constant_evaluated() && num_bits() == 32) - return (FMT_BUILTIN_CLZ(static_cast(n) | 1) ^ 31) / BITS + 1; -#endif - // Lambda avoids unreachable code warnings from NVHPC. 
- return [](UInt m) { - int num_digits = 0; - do { - ++num_digits; - } while ((m >>= BITS) != 0); - return num_digits; - }(n); -} - -#ifdef FMT_BUILTIN_CLZ -// It is a separate function rather than a part of count_digits to workaround -// the lack of static constexpr in constexpr functions. -FMT_INLINE auto do_count_digits(uint32_t n) -> int { -// An optimization by Kendall Willets from https://bit.ly/3uOIQrB. -// This increments the upper 32 bits (log10(T) - 1) when >= T is added. -#define FMT_INC(T) (((sizeof(#T) - 1ull) << 32) - T) - static constexpr uint64_t table[] = { - FMT_INC(0), FMT_INC(0), FMT_INC(0), // 8 - FMT_INC(10), FMT_INC(10), FMT_INC(10), // 64 - FMT_INC(100), FMT_INC(100), FMT_INC(100), // 512 - FMT_INC(1000), FMT_INC(1000), FMT_INC(1000), // 4096 - FMT_INC(10000), FMT_INC(10000), FMT_INC(10000), // 32k - FMT_INC(100000), FMT_INC(100000), FMT_INC(100000), // 256k - FMT_INC(1000000), FMT_INC(1000000), FMT_INC(1000000), // 2048k - FMT_INC(10000000), FMT_INC(10000000), FMT_INC(10000000), // 16M - FMT_INC(100000000), FMT_INC(100000000), FMT_INC(100000000), // 128M - FMT_INC(1000000000), FMT_INC(1000000000), - FMT_INC(1000000000), // 1024M - FMT_INC(1000000000), FMT_INC(1000000000) // 4B - }; - auto inc = table[FMT_BUILTIN_CLZ(n | 1) ^ 31]; - return static_cast((n + inc) >> 32); -} -#endif - -// Optional version of count_digits for better performance on 32-bit platforms. -FMT_CONSTEXPR20 inline auto count_digits(uint32_t n) -> int { -#ifdef FMT_BUILTIN_CLZ - if (!is_constant_evaluated()) { return do_count_digits(n); } -#endif - return count_digits_fallback(n); -} - -template -constexpr auto digits10() noexcept -> int { - return std::numeric_limits::digits10; -} -template <> -constexpr auto digits10() noexcept -> int { - return 38; -} -template <> -constexpr auto digits10() noexcept -> int { - return 38; -} - -template -struct thousands_sep_result { - std::string grouping; - Char thousands_sep; -}; - -template -FMT_API auto thousands_sep_impl(locale_ref loc) -> thousands_sep_result; -template -inline auto thousands_sep(locale_ref loc) -> thousands_sep_result { - auto result = thousands_sep_impl(loc); - return {result.grouping, Char(result.thousands_sep)}; -} -template <> -inline auto thousands_sep(locale_ref loc) -> thousands_sep_result { - return thousands_sep_impl(loc); -} - -template -FMT_API auto decimal_point_impl(locale_ref loc) -> Char; -template -inline auto decimal_point(locale_ref loc) -> Char { - return Char(decimal_point_impl(loc)); -} -template <> -inline auto decimal_point(locale_ref loc) -> wchar_t { - return decimal_point_impl(loc); -} - -// Compares two characters for equality. -template -auto equal2(const Char *lhs, const char *rhs) -> bool { - return lhs[0] == Char(rhs[0]) && lhs[1] == Char(rhs[1]); -} -inline auto equal2(const char *lhs, const char *rhs) -> bool { - return memcmp(lhs, rhs, 2) == 0; -} - -// Copies two characters from src to dst. -template -FMT_CONSTEXPR20 FMT_INLINE void copy2(Char *dst, const char *src) { - if (!is_constant_evaluated() && sizeof(Char) == sizeof(char)) { - memcpy(dst, src, 2); - return; - } - *dst++ = static_cast(*src++); - *dst = static_cast(*src); -} - -template -struct format_decimal_result { - Iterator begin; - Iterator end; -}; - -// Formats a decimal unsigned integer value writing into out pointing to a -// buffer of specified size. The caller must ensure that the buffer is large -// enough. 
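That routine is where the `digits2()` table from earlier pays off: each integer division yields two output characters, filled right to left. An illustrative standalone version with an assumed name, not fmt's API:

    #include <cstdint>
    #include <cstring>
    #include <string>

    std::string u32_to_string(uint32_t value) {
        static const char table[] = "0001020304050607080910111213141516171819"
                                    "2021222324252627282930313233343536373839"
                                    "4041424344454647484950515253545556575859"
                                    "6061626364656667686970717273747576777879"
                                    "8081828384858687888990919293949596979899";
        char buf[10];                 // uint32_t has at most 10 decimal digits
        char *p = buf + sizeof(buf);  // fill right to left
        while (value >= 100) {
            p -= 2;
            std::memcpy(p, &table[(value % 100) * 2], 2);
            value /= 100;
        }
        if (value >= 10) {
            p -= 2;
            std::memcpy(p, &table[value * 2], 2);
        } else {
            *--p = static_cast<char>('0' + value);
        }
        return std::string(p, buf + sizeof(buf));
    }

`u32_to_string(4294967295)` emits all ten digits with roughly half the divisions of a digit-at-a-time loop.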
-template -FMT_CONSTEXPR20 auto format_decimal(Char *out, UInt value, int size) - -> format_decimal_result { - FMT_ASSERT(size >= count_digits(value), "invalid digit count"); - out += size; - Char *end = out; - while (value >= 100) { - // Integer division is slow so do it for a group of two digits instead - // of for every digit. The idea comes from the talk by Alexandrescu - // "Three Optimization Tips for C++". See speed-test for a comparison. - out -= 2; - copy2(out, digits2(static_cast(value % 100))); - value /= 100; - } - if (value < 10) { - *--out = static_cast('0' + value); - return {out, end}; - } - out -= 2; - copy2(out, digits2(static_cast(value))); - return {out, end}; -} - -template >::value)> -FMT_CONSTEXPR inline auto format_decimal(Iterator out, UInt value, int size) - -> format_decimal_result { - // Buffer is large enough to hold all digits (digits10 + 1). - Char buffer[digits10() + 1] = {}; - auto end = format_decimal(buffer, value, size).end; - return {out, detail::copy_str_noinline(buffer, end, out)}; -} - -template -FMT_CONSTEXPR auto format_uint(Char *buffer, UInt value, int num_digits, - bool upper = false) -> Char * { - buffer += num_digits; - Char *end = buffer; - do { - const char *digits = upper ? "0123456789ABCDEF" : "0123456789abcdef"; - unsigned digit = static_cast(value & ((1 << BASE_BITS) - 1)); - *--buffer = static_cast( - BASE_BITS < 4 ? static_cast('0' + digit) : digits[digit]); - } while ((value >>= BASE_BITS) != 0); - return end; -} - -template -FMT_CONSTEXPR inline auto format_uint( - It out, UInt value, int num_digits, bool upper = false) -> It { - if (auto ptr = to_pointer(out, to_unsigned(num_digits))) { - format_uint(ptr, value, num_digits, upper); - return out; - } - // Buffer should be large enough to hold all digits (digits / BASE_BITS + 1). - char buffer[num_bits() / BASE_BITS + 1] = {}; - format_uint(buffer, value, num_digits, upper); - return detail::copy_str_noinline(buffer, buffer + num_digits, out); -} - -// A converter from UTF-8 to UTF-16. -class utf8_to_utf16 { -private: - basic_memory_buffer buffer_; - -public: - FMT_API explicit utf8_to_utf16(string_view s); - operator basic_string_view() const { - return {&buffer_[0], size()}; - } - auto size() const -> size_t { return buffer_.size() - 1; } - auto c_str() const -> const wchar_t * { return &buffer_[0]; } - auto str() const -> std::wstring { return {&buffer_[0], size()}; } -}; - -enum class to_utf8_error_policy { abort, replace }; - -// A converter from UTF-16/UTF-32 (host endian) to UTF-8. -template -class to_utf8 { -private: - Buffer buffer_; - -public: - to_utf8() {} - explicit to_utf8(basic_string_view s, - to_utf8_error_policy policy = to_utf8_error_policy::abort) { - static_assert(sizeof(WChar) == 2 || sizeof(WChar) == 4, - "Expect utf16 or utf32"); - if (!convert(s, policy)) - FMT_THROW(std::runtime_error( - sizeof(WChar) == 2 ? "invalid utf16" : "invalid utf32")); - } - operator string_view() const { return string_view(&buffer_[0], size()); } - auto size() const -> size_t { return buffer_.size() - 1; } - auto c_str() const -> const char * { return &buffer_[0]; } - auto str() const -> std::string { return std::string(&buffer_[0], size()); } - - // Performs conversion returning a bool instead of throwing exception on - // conversion error. This method may still throw in case of memory allocation - // error. 
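Two details of `convert()` below are worth unpacking. First, the constant 0x35fdc00 it subtracts is ((0xd800 << 10) + 0xdc00 - 0x10000) folded together, so `c = (c << 10) + *p - 0x35fdc00` recovers the scalar value from a surrogate pair in one step. Second, the byte-emission cascade is ordinary UTF-8 encoding; a standalone sketch with an assumed name:

    #include <cstdint>
    #include <string>

    // Append the UTF-8 bytes for one scalar value; returns false for values
    // UTF-8 cannot represent (lone surrogates, anything past U+10FFFF).
    bool append_utf8(std::string &out, uint32_t c) {
        if (c < 0x80) {
            out += static_cast<char>(c);
        } else if (c < 0x800) {
            out += static_cast<char>(0xc0 | (c >> 6));
            out += static_cast<char>(0x80 | (c & 0x3f));
        } else if (c < 0x10000) {
            if (c >= 0xd800 && c <= 0xdfff) return false;
            out += static_cast<char>(0xe0 | (c >> 12));
            out += static_cast<char>(0x80 | ((c >> 6) & 0x3f));
            out += static_cast<char>(0x80 | (c & 0x3f));
        } else if (c <= 0x10ffff) {
            out += static_cast<char>(0xf0 | (c >> 18));
            out += static_cast<char>(0x80 | ((c >> 12) & 0x3f));
            out += static_cast<char>(0x80 | ((c >> 6) & 0x3f));
            out += static_cast<char>(0x80 | (c & 0x3f));
        } else {
            return false;
        }
        return true;
    }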
- auto convert(basic_string_view s, - to_utf8_error_policy policy = to_utf8_error_policy::abort) -> bool { - if (!convert(buffer_, s, policy)) return false; - buffer_.push_back(0); - return true; - } - static auto convert(Buffer &buf, basic_string_view s, - to_utf8_error_policy policy = to_utf8_error_policy::abort) -> bool { - for (auto p = s.begin(); p != s.end(); ++p) { - uint32_t c = static_cast(*p); - if (sizeof(WChar) == 2 && c >= 0xd800 && c <= 0xdfff) { - // Handle a surrogate pair. - ++p; - if (p == s.end() || (c & 0xfc00) != 0xd800 - || (*p & 0xfc00) != 0xdc00) { - if (policy == to_utf8_error_policy::abort) return false; - buf.append(string_view("\xEF\xBF\xBD")); - --p; - } else { - c = (c << 10) + static_cast(*p) - 0x35fdc00; - } - } else if (c < 0x80) { - buf.push_back(static_cast(c)); - } else if (c < 0x800) { - buf.push_back(static_cast(0xc0 | (c >> 6))); - buf.push_back(static_cast(0x80 | (c & 0x3f))); - } else if ((c >= 0x800 && c <= 0xd7ff) - || (c >= 0xe000 && c <= 0xffff)) { - buf.push_back(static_cast(0xe0 | (c >> 12))); - buf.push_back(static_cast(0x80 | ((c & 0xfff) >> 6))); - buf.push_back(static_cast(0x80 | (c & 0x3f))); - } else if (c >= 0x10000 && c <= 0x10ffff) { - buf.push_back(static_cast(0xf0 | (c >> 18))); - buf.push_back(static_cast(0x80 | ((c & 0x3ffff) >> 12))); - buf.push_back(static_cast(0x80 | ((c & 0xfff) >> 6))); - buf.push_back(static_cast(0x80 | (c & 0x3f))); - } else { - return false; - } - } - return true; - } -}; - -// Computes 128-bit result of multiplication of two 64-bit unsigned integers. -inline auto umul128(uint64_t x, uint64_t y) noexcept -> uint128_fallback { -#if FMT_USE_INT128 - auto p = static_cast(x) * static_cast(y); - return {static_cast(p >> 64), static_cast(p)}; -#elif defined(_MSC_VER) && defined(_M_X64) - auto hi = uint64_t(); - auto lo = _umul128(x, y, &hi); - return {hi, lo}; -#else - const uint64_t mask = static_cast(max_value()); - - uint64_t a = x >> 32; - uint64_t b = x & mask; - uint64_t c = y >> 32; - uint64_t d = y & mask; - - uint64_t ac = a * c; - uint64_t bc = b * c; - uint64_t ad = a * d; - uint64_t bd = b * d; - - uint64_t intermediate = (bd >> 32) + (ad & mask) + (bc & mask); - - return {ac + (intermediate >> 32) + (ad >> 32) + (bc >> 32), - (intermediate << 32) + (bd & mask)}; -#endif -} - -namespace dragonbox { -// Computes floor(log10(pow(2, e))) for e in [-2620, 2620] using the method from -// https://fmt.dev/papers/Dragonbox.pdf#page=28, section 6.1. -inline auto floor_log10_pow2(int e) noexcept -> int { - FMT_ASSERT(e <= 2620 && e >= -2620, "too large exponent"); - static_assert((-1 >> 1) == -1, "right shift is not arithmetic"); - return (e * 315653) >> 20; -} - -inline auto floor_log2_pow10(int e) noexcept -> int { - FMT_ASSERT(e <= 1233 && e >= -1233, "too large exponent"); - return (e * 1741647) >> 19; -} - -// Computes upper 64 bits of multiplication of two 64-bit unsigned integers. -inline auto umul128_upper64(uint64_t x, uint64_t y) noexcept -> uint64_t { -#if FMT_USE_INT128 - auto p = static_cast(x) * static_cast(y); - return static_cast(p >> 64); -#elif defined(_MSC_VER) && defined(_M_X64) - return __umulh(x, y); -#else - return umul128(x, y).high(); -#endif -} - -// Computes upper 128 bits of multiplication of a 64-bit unsigned integer and a -// 128-bit unsigned integer. 
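Before that helper, the two shift-based logarithms defined a few lines up deserve a note: 315653 / 2^20 ≈ 0.301030 and 1741647 / 2^19 ≈ 3.321929 are fixed-point approximations of log10(2) and log2(10), chosen so the truncated product matches the exact floor over the asserted exponent ranges. An illustrative self-check, not part of fmt (like the original, it relies on arithmetic right shift for negative e, which the original guards with its own static_assert):

    #include <cassert>
    #include <cmath>

    int main() {
        for (int e = -2620; e <= 2620; ++e) {
            int fast = (e * 315653) >> 20;  // floor_log10_pow2(e)
            int slow = static_cast<int>(std::floor(e * std::log10(2.0)));
            assert(fast == slow);
        }
        return 0;
    }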
-inline auto umul192_upper128(uint64_t x, uint128_fallback y) noexcept - -> uint128_fallback { - uint128_fallback r = umul128(x, y.high()); - r += umul128_upper64(x, y.low()); - return r; -} - -FMT_API auto get_cached_power(int k) noexcept -> uint128_fallback; - -// Type-specific information that Dragonbox uses. -template -struct float_info; - -template <> -struct float_info { - using carrier_uint = uint32_t; - static const int exponent_bits = 8; - static const int kappa = 1; - static const int big_divisor = 100; - static const int small_divisor = 10; - static const int min_k = -31; - static const int max_k = 46; - static const int shorter_interval_tie_lower_threshold = -35; - static const int shorter_interval_tie_upper_threshold = -35; -}; - -template <> -struct float_info { - using carrier_uint = uint64_t; - static const int exponent_bits = 11; - static const int kappa = 2; - static const int big_divisor = 1000; - static const int small_divisor = 100; - static const int min_k = -292; - static const int max_k = 341; - static const int shorter_interval_tie_lower_threshold = -77; - static const int shorter_interval_tie_upper_threshold = -77; -}; - -// An 80- or 128-bit floating point number. -template -struct float_info::digits == 64 - || std::numeric_limits::digits == 113 - || is_float128::value>> { - using carrier_uint = detail::uint128_t; - static const int exponent_bits = 15; -}; - -// A double-double floating point number. -template -struct float_info::value>> { - using carrier_uint = detail::uint128_t; -}; - -template -struct decimal_fp { - using significand_type = typename float_info::carrier_uint; - significand_type significand; - int exponent; -}; - -template -FMT_API auto to_decimal(T x) noexcept -> decimal_fp; -} // namespace dragonbox - -// Returns true iff Float has the implicit bit which is not stored. -template -constexpr auto has_implicit_bit() -> bool { - // An 80-bit FP number has a 64-bit significand an no implicit bit. - return std::numeric_limits::digits != 64; -} - -// Returns the number of significand bits stored in Float. The implicit bit is -// not counted since it is not stored. -template -constexpr auto num_significand_bits() -> int { - // std::numeric_limits may not support __float128. - return is_float128() ? 112 - : (std::numeric_limits::digits - - (has_implicit_bit() ? 1 : 0)); -} - -template -constexpr auto exponent_mask() -> - typename dragonbox::float_info::carrier_uint { - using float_uint = typename dragonbox::float_info::carrier_uint; - return ((float_uint(1) << dragonbox::float_info::exponent_bits) - 1) - << num_significand_bits(); -} -template -constexpr auto exponent_bias() -> int { - // std::numeric_limits may not support __float128. - return is_float128() ? 16383 - : std::numeric_limits::max_exponent - 1; -} - -// Writes the exponent exp in the form "[+-]d{2,3}" to buffer. -template -FMT_CONSTEXPR auto write_exponent(int exp, It it) -> It { - FMT_ASSERT(-10000 < exp && exp < 10000, "exponent out of range"); - if (exp < 0) { - *it++ = static_cast('-'); - exp = -exp; - } else { - *it++ = static_cast('+'); - } - if (exp >= 100) { - const char *top = digits2(to_unsigned(exp / 100)); - if (exp >= 1000) *it++ = static_cast(top[0]); - *it++ = static_cast(top[1]); - exp %= 100; - } - const char *d = digits2(to_unsigned(exp)); - *it++ = static_cast(d[0]); - *it++ = static_cast(d[1]); - return it; -} - -// A floating-point number f * pow(2, e) where F is an unsigned type. 
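The float_info traits and the exponent_mask/exponent_bias helpers above all describe the same IEEE-754 layout that basic_fp, defined next, unpacks: stored significand in the low bits, biased exponent above it, sign bit on top. Here is that decomposition for double as a standalone sketch (assumes C++20 std::bit_cast; fmt ships its own bit_cast for older standards):

#include <bit>
#include <cmath>
#include <cstdint>
#include <cstdio>

int main() {
    double n = 6.5; // 1.101 (binary) * 2^2
    uint64_t u = std::bit_cast<uint64_t>(n);
    const int sig_bits = 52; // stored significand bits; implicit 1 not stored
    uint64_t frac = u & ((uint64_t(1) << sig_bits) - 1);
    int biased_e = int((u >> sig_bits) & 0x7ff); // 11 exponent bits
    int sign = int(u >> 63);
    uint64_t f = frac | (uint64_t(1) << sig_bits); // restore the implicit bit
    int e = biased_e - 1023 - sig_bits;            // so that n == f * 2^e
    std::printf("sign=%d f=0x%llx e=%d reconstructed=%g\n", sign,
            (unsigned long long)f, e, std::ldexp(double(f), e));
}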
-template -struct basic_fp { - F f; - int e; - - static constexpr const int num_significand_bits - = static_cast(sizeof(F) * num_bits()); - - constexpr basic_fp() : f(0), e(0) {} - constexpr basic_fp(uint64_t f_val, int e_val) : f(f_val), e(e_val) {} - - // Constructs fp from an IEEE754 floating-point number. - template - FMT_CONSTEXPR basic_fp(Float n) { - assign(n); - } - - // Assigns n to this and return true iff predecessor is closer than successor. - template ::value)> - FMT_CONSTEXPR auto assign(Float n) -> bool { - static_assert( - std::numeric_limits::digits <= 113, "unsupported FP"); - // Assume Float is in the format [sign][exponent][significand]. - using carrier_uint = - typename dragonbox::float_info::carrier_uint; - const auto num_float_significand_bits - = detail::num_significand_bits(); - const auto implicit_bit = carrier_uint(1) << num_float_significand_bits; - const auto significand_mask = implicit_bit - 1; - auto u = bit_cast(n); - f = static_cast(u & significand_mask); - auto biased_e = static_cast( - (u & exponent_mask()) >> num_float_significand_bits); - // The predecessor is closer if n is a normalized power of 2 (f == 0) - // other than the smallest normalized number (biased_e > 1). - auto is_predecessor_closer = f == 0 && biased_e > 1; - if (biased_e == 0) - biased_e = 1; // Subnormals use biased exponent 1 (min exponent). - else if (has_implicit_bit()) - f += static_cast(implicit_bit); - e = biased_e - exponent_bias() - num_float_significand_bits; - if (!has_implicit_bit()) ++e; - return is_predecessor_closer; - } - - template ::value)> - FMT_CONSTEXPR auto assign(Float n) -> bool { - static_assert(std::numeric_limits::is_iec559, "unsupported FP"); - return assign(static_cast(n)); - } -}; - -using fp = basic_fp; - -// Normalizes the value converted from double and multiplied by (1 << SHIFT). -template -FMT_CONSTEXPR auto normalize(basic_fp value) -> basic_fp { - // Handle subnormals. - const auto implicit_bit = F(1) << num_significand_bits(); - const auto shifted_implicit_bit = implicit_bit << SHIFT; - while ((value.f & shifted_implicit_bit) == 0) { - value.f <<= 1; - --value.e; - } - // Subtract 1 to account for hidden bit. - const auto offset = basic_fp::num_significand_bits - - num_significand_bits() - SHIFT - 1; - value.f <<= offset; - value.e -= offset; - return value; -} - -// Computes lhs * rhs / pow(2, 64) rounded to nearest with half-up tie breaking. -FMT_CONSTEXPR inline auto multiply(uint64_t lhs, uint64_t rhs) -> uint64_t { -#if FMT_USE_INT128 - auto product = static_cast<__uint128_t>(lhs) * rhs; - auto f = static_cast(product >> 64); - return (static_cast(product) & (1ULL << 63)) != 0 ? f + 1 : f; -#else - // Multiply 32-bit parts of significands. - uint64_t mask = (1ULL << 32) - 1; - uint64_t a = lhs >> 32, b = lhs & mask; - uint64_t c = rhs >> 32, d = rhs & mask; - uint64_t ac = a * c, bc = b * c, ad = a * d, bd = b * d; - // Compute mid 64-bit of result and round. 
- uint64_t mid = (bd >> 32) + (ad & mask) + (bc & mask) + (1U << 31); - return ac + (ad >> 32) + (bc >> 32) + (mid >> 32); -#endif -} - -FMT_CONSTEXPR inline auto operator*(fp x, fp y) -> fp { - return {multiply(x.f, y.f), x.e + y.e + 64}; -} - -template () == num_bits()> -using convert_float_result - = conditional_t::value || doublish, double, T>; - -template -constexpr auto convert_float(T value) -> convert_float_result { - return static_cast>(value); -} - -template -FMT_NOINLINE FMT_CONSTEXPR auto fill( - OutputIt it, size_t n, const fill_t &fill) -> OutputIt { - auto fill_size = fill.size(); - if (fill_size == 1) return detail::fill_n(it, n, fill[0]); - auto data = fill.data(); - for (size_t i = 0; i < n; ++i) - it = copy_str(data, data + fill_size, it); - return it; -} - -// Writes the output of f, padded according to format specifications in specs. -// size: output size in code units. -// width: output display width in (terminal) column positions. -template -FMT_CONSTEXPR auto write_padded(OutputIt out, const format_specs &specs, - size_t size, size_t width, F &&f) -> OutputIt { - static_assert(align == align::left || align == align::right, ""); - unsigned spec_width = to_unsigned(specs.width); - size_t padding = spec_width > width ? spec_width - width : 0; - // Shifts are encoded as string literals because static constexpr is not - // supported in constexpr functions. - auto *shifts - = align == align::left ? "\x1f\x1f\x00\x01" : "\x00\x1f\x00\x01"; - size_t left_padding = padding >> shifts[specs.align]; - size_t right_padding = padding - left_padding; - auto it = reserve(out, size + padding * specs.fill.size()); - if (left_padding != 0) it = fill(it, left_padding, specs.fill); - it = f(it); - if (right_padding != 0) it = fill(it, right_padding, specs.fill); - return base_iterator(out, it); -} - -template -constexpr auto write_padded(OutputIt out, const format_specs &specs, - size_t size, F &&f) -> OutputIt { - return write_padded(out, specs, size, size, f); -} - -template -FMT_CONSTEXPR auto write_bytes(OutputIt out, string_view bytes, - const format_specs &specs) -> OutputIt { - return write_padded( - out, specs, bytes.size(), [bytes](reserve_iterator it) { - const char *data = bytes.data(); - return copy_str(data, data + bytes.size(), it); - }); -} - -template -auto write_ptr(OutputIt out, UIntPtr value, const format_specs *specs) - -> OutputIt { - int num_digits = count_digits<4>(value); - auto size = to_unsigned(num_digits) + size_t(2); - auto write = [=](reserve_iterator it) { - *it++ = static_cast('0'); - *it++ = static_cast('x'); - return format_uint<4, Char>(it, value, num_digits); - }; - return specs ? write_padded(out, *specs, size, write) - : base_iterator(out, write(reserve(out, size))); -} - -// Returns true iff the code point cp is printable. 
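The string literal of shift amounts in write_padded above is a compact alignment dispatch: padding >> 31 is always 0 (left alignment puts all padding after the content), padding >> 0 keeps all of it in front (right alignment), and padding >> 1 splits it for centering, with the remainder going after. Assuming fmt's enum order none, left, right, center for specs.align (the second literal covers the right-aligned template default), the mapping looks like this as a standalone sketch (is_printable is declared next):

#include <cstdio>

enum align { none, left, right, center }; // assumed to mirror fmt's order

int main() {
    // One byte per alignment value: the right shift applied to `padding`
    // to obtain the share written before the content.
    const char shifts[] = "\x1f\x1f\x00\x01"; // table for template align==left
    unsigned padding = 7;
    for (int a = none; a <= center; ++a) {
        unsigned before = padding >> shifts[a];
        std::printf("align=%d: %u before, %u after\n", a, before,
                padding - before);
    }
}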
-FMT_API auto is_printable(uint32_t cp) -> bool;
-
-inline auto needs_escape(uint32_t cp) -> bool {
-    return cp < 0x20 || cp == 0x7f || cp == '"' || cp == '\\'
-            || !is_printable(cp);
-}
-
-template <typename Char>
-struct find_escape_result {
-    const Char *begin;
-    const Char *end;
-    uint32_t cp;
-};
-
-template <typename Char>
-using make_unsigned_char = typename conditional_t<std::is_integral<Char>::value,
-        std::make_unsigned<Char>, type_identity<uint32_t>>::type;
-
-template <typename Char>
-auto find_escape(const Char *begin, const Char *end)
-        -> find_escape_result<Char> {
-    for (; begin != end; ++begin) {
-        uint32_t cp = static_cast<make_unsigned_char<Char>>(*begin);
-        if (const_check(sizeof(Char) == 1) && cp >= 0x80) continue;
-        if (needs_escape(cp)) return {begin, begin + 1, cp};
-    }
-    return {begin, nullptr, 0};
-}
-
-inline auto find_escape(const char *begin, const char *end)
-        -> find_escape_result<char> {
-    if (!is_utf8()) return find_escape<char>(begin, end);
-    auto result = find_escape_result<char> {end, nullptr, 0};
-    for_each_codepoint(string_view(begin, to_unsigned(end - begin)),
-            [&](uint32_t cp, string_view sv) {
-                if (needs_escape(cp)) {
-                    result = {sv.begin(), sv.end(), cp};
-                    return false;
-                }
-                return true;
-            });
-    return result;
-}
-
-#define FMT_STRING_IMPL(s, base, explicit) \
-    [] { \
-        /* Use the hidden visibility as a workaround for a GCC bug (#1973). */ \
-        /* Use a macro-like name to avoid shadowing warnings. */ \
-        struct FMT_VISIBILITY("hidden") FMT_COMPILE_STRING : base { \
-            using char_type FMT_MAYBE_UNUSED \
-                    = fmt::remove_cvref_t<decltype(s[0])>; \
-            FMT_MAYBE_UNUSED FMT_CONSTEXPR explicit \
-            operator fmt::basic_string_view<char_type>() const { \
-                return fmt::detail_exported::compile_string_to_view< \
-                        char_type>(s); \
-            } \
-        }; \
-        return FMT_COMPILE_STRING(); \
-    }()
-
-/**
-  \rst
-  Constructs a compile-time format string from a string literal *s*.
-
-  **Example**::
-
-    // A compile-time error because 'd' is an invalid specifier for strings.
- std::string s = fmt::format(FMT_STRING("{:d}"), "foo"); - \endrst - */ -#define FMT_STRING(s) FMT_STRING_IMPL(s, fmt::detail::compile_string, ) - -template -auto write_codepoint(OutputIt out, char prefix, uint32_t cp) -> OutputIt { - *out++ = static_cast('\\'); - *out++ = static_cast(prefix); - Char buf[width]; - fill_n(buf, width, static_cast('0')); - format_uint<4>(buf, cp, width); - return copy_str(buf, buf + width, out); -} - -template -auto write_escaped_cp(OutputIt out, const find_escape_result &escape) - -> OutputIt { - auto c = static_cast(escape.cp); - switch (escape.cp) { - case '\n': - *out++ = static_cast('\\'); - c = static_cast('n'); - break; - case '\r': - *out++ = static_cast('\\'); - c = static_cast('r'); - break; - case '\t': - *out++ = static_cast('\\'); - c = static_cast('t'); - break; - case '"': FMT_FALLTHROUGH; - case '\'': FMT_FALLTHROUGH; - case '\\': *out++ = static_cast('\\'); break; - default: - if (escape.cp < 0x100) { - return write_codepoint<2, Char>(out, 'x', escape.cp); - } - if (escape.cp < 0x10000) { - return write_codepoint<4, Char>(out, 'u', escape.cp); - } - if (escape.cp < 0x110000) { - return write_codepoint<8, Char>(out, 'U', escape.cp); - } - for (Char escape_char : basic_string_view(escape.begin, - to_unsigned(escape.end - escape.begin))) { - out = write_codepoint<2, Char>( - out, 'x', static_cast(escape_char) & 0xFF); - } - return out; - } - *out++ = c; - return out; -} - -template -auto write_escaped_string(OutputIt out, basic_string_view str) - -> OutputIt { - *out++ = static_cast('"'); - auto begin = str.begin(), end = str.end(); - do { - auto escape = find_escape(begin, end); - out = copy_str(begin, escape.begin, out); - begin = escape.end; - if (!begin) break; - out = write_escaped_cp(out, escape); - } while (begin != end); - *out++ = static_cast('"'); - return out; -} - -template -auto write_escaped_char(OutputIt out, Char v) -> OutputIt { - Char v_array[1] = {v}; - *out++ = static_cast('\''); - if ((needs_escape(static_cast(v)) && v != static_cast('"')) - || v == static_cast('\'')) { - out = write_escaped_cp(out, - find_escape_result { - v_array, v_array + 1, static_cast(v)}); - } else { - *out++ = v; - } - *out++ = static_cast('\''); - return out; -} - -template -FMT_CONSTEXPR auto write_char( - OutputIt out, Char value, const format_specs &specs) -> OutputIt { - bool is_debug = specs.type == presentation_type::debug; - return write_padded(out, specs, 1, [=](reserve_iterator it) { - if (is_debug) return write_escaped_char(it, value); - *it++ = value; - return it; - }); -} -template -FMT_CONSTEXPR auto write(OutputIt out, Char value, - const format_specs &specs, locale_ref loc = {}) -> OutputIt { - // char is formatted as unsigned char for consistency across platforms. - using unsigned_type = conditional_t::value, - unsigned char, unsigned>; - return check_char_specs(specs) - ? write_char(out, value, specs) - : write(out, static_cast(value), specs, loc); -} - -// Data for write_int that doesn't depend on output iterator type. It is used to -// avoid template code bloat. 
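write_escaped_cp above picks the escape form by codepoint range: \xNN below U+0100, \uNNNN below U+10000, \UNNNNNNNN up to U+10FFFF, and a byte-wise \xNN fallback for invalid input. The same threshold logic as a standalone sketch (write_int_data follows below):

#include <cstdint>
#include <cstdio>
#include <string>

// Chooses the escape width the way the code above does: 2, 4, or 8 hex digits.
static std::string escape_cp(uint32_t cp) {
    char buf[16];
    if (cp < 0x100)
        std::snprintf(buf, sizeof buf, "\\x%02x", (unsigned)cp);
    else if (cp < 0x10000)
        std::snprintf(buf, sizeof buf, "\\u%04x", (unsigned)cp);
    else
        std::snprintf(buf, sizeof buf, "\\U%08x", (unsigned)cp);
    return buf;
}

int main() {
    // prints: \x7f \u2603 \U0001f600
    std::printf("%s %s %s\n", escape_cp(0x7f).c_str(),
            escape_cp(0x2603).c_str(), escape_cp(0x1f600).c_str());
}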
-template -struct write_int_data { - size_t size; - size_t padding; - - FMT_CONSTEXPR write_int_data( - int num_digits, unsigned prefix, const format_specs &specs) - : size((prefix >> 24) + to_unsigned(num_digits)), padding(0) { - if (specs.align == align::numeric) { - auto width = to_unsigned(specs.width); - if (width > size) { - padding = width - size; - size = width; - } - } else if (specs.precision > num_digits) { - size = (prefix >> 24) + to_unsigned(specs.precision); - padding = to_unsigned(specs.precision - num_digits); - } - } -}; - -// Writes an integer in the format -// -// where are written by write_digits(it). -// prefix contains chars in three lower bytes and the size in the fourth byte. -template -FMT_CONSTEXPR FMT_INLINE auto write_int(OutputIt out, int num_digits, - unsigned prefix, const format_specs &specs, W write_digits) - -> OutputIt { - // Slightly faster check for specs.width == 0 && specs.precision == -1. - if ((specs.width | (specs.precision + 1)) == 0) { - auto it = reserve(out, to_unsigned(num_digits) + (prefix >> 24)); - if (prefix != 0) { - for (unsigned p = prefix & 0xffffff; p != 0; p >>= 8) - *it++ = static_cast(p & 0xff); - } - return base_iterator(out, write_digits(it)); - } - auto data = write_int_data(num_digits, prefix, specs); - return write_padded( - out, specs, data.size, [=](reserve_iterator it) { - for (unsigned p = prefix & 0xffffff; p != 0; p >>= 8) - *it++ = static_cast(p & 0xff); - it = detail::fill_n(it, data.padding, static_cast('0')); - return write_digits(it); - }); -} - -template -class digit_grouping { -private: - std::string grouping_; - std::basic_string thousands_sep_; - - struct next_state { - std::string::const_iterator group; - int pos; - }; - auto initial_state() const -> next_state { return {grouping_.begin(), 0}; } - - // Returns the next digit group separator position. - auto next(next_state &state) const -> int { - if (thousands_sep_.empty()) return max_value(); - if (state.group == grouping_.end()) - return state.pos += grouping_.back(); - if (*state.group <= 0 || *state.group == max_value()) - return max_value(); - state.pos += *state.group++; - return state.pos; - } - -public: - explicit digit_grouping(locale_ref loc, bool localized = true) { - if (!localized) return; - auto sep = thousands_sep(loc); - grouping_ = sep.grouping; - if (sep.thousands_sep) thousands_sep_.assign(1, sep.thousands_sep); - } - digit_grouping(std::string grouping, std::basic_string sep) - : grouping_(std::move(grouping)), thousands_sep_(std::move(sep)) {} - - auto has_separator() const -> bool { return !thousands_sep_.empty(); } - - auto count_separators(int num_digits) const -> int { - int count = 0; - auto state = initial_state(); - while (num_digits > next(state)) - ++count; - return count; - } - - // Applies grouping to digits and write the output to out. 
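digit_grouping above follows the POSIX localeconv convention: each byte of the grouping string is a group size counted from the least significant digit, the last size repeats, and a non-positive or CHAR_MAX byte stops further grouping. A standalone sketch of that interpretation, not fmt's exact member functions (digit_grouping::apply, whose comment appears just above, is defined next):

#include <climits>
#include <cstdio>
#include <string>

static std::string group_digits(const std::string &digits,
        const std::string &grouping, char sep) {
    std::string out;
    size_t g = 0;
    int next = grouping.empty() ? INT_MAX : grouping[0];
    if (next <= 0 || next == CHAR_MAX) next = INT_MAX;
    for (int i = int(digits.size()) - 1; i >= 0; --i) {
        out.insert(out.begin(), digits[size_t(i)]);
        if (--next == 0 && i != 0) {
            out.insert(out.begin(), sep);
            if (g + 1 < grouping.size()) ++g; // last group size repeats
            next = grouping[g];
            if (next <= 0 || next == CHAR_MAX) next = INT_MAX;
        }
    }
    return out;
}

int main() {
    // en_US-style "\3":    1234567 -> 1,234,567
    std::printf("%s\n", group_digits("1234567", "\3", ',').c_str());
    // Indian-style "\3\2": 1234567 -> 12,34,567
    std::printf("%s\n", group_digits("1234567", "\3\2", ',').c_str());
}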
- template - auto apply(Out out, basic_string_view digits) const -> Out { - auto num_digits = static_cast(digits.size()); - auto separators = basic_memory_buffer(); - separators.push_back(0); - auto state = initial_state(); - while (int i = next(state)) { - if (i >= num_digits) break; - separators.push_back(i); - } - for (int i = 0, sep_index = static_cast(separators.size() - 1); - i < num_digits; ++i) { - if (num_digits - i == separators[sep_index]) { - out = copy_str(thousands_sep_.data(), - thousands_sep_.data() + thousands_sep_.size(), out); - --sep_index; - } - *out++ = static_cast(digits[to_unsigned(i)]); - } - return out; - } -}; - -FMT_CONSTEXPR inline void prefix_append(unsigned &prefix, unsigned value) { - prefix |= prefix != 0 ? value << 8 : value; - prefix += (1u + (value > 0xff ? 1 : 0)) << 24; -} - -// Writes a decimal integer with digit grouping. -template -auto write_int(OutputIt out, UInt value, unsigned prefix, - const format_specs &specs, const digit_grouping &grouping) - -> OutputIt { - static_assert(std::is_same, UInt>::value, ""); - int num_digits = 0; - auto buffer = memory_buffer(); - switch (specs.type) { - case presentation_type::none: - case presentation_type::dec: { - num_digits = count_digits(value); - format_decimal(appender(buffer), value, num_digits); - break; - } - case presentation_type::hex_lower: - case presentation_type::hex_upper: { - bool upper = specs.type == presentation_type::hex_upper; - if (specs.alt) - prefix_append(prefix, unsigned(upper ? 'X' : 'x') << 8 | '0'); - num_digits = count_digits<4>(value); - format_uint<4, char>(appender(buffer), value, num_digits, upper); - break; - } - case presentation_type::bin_lower: - case presentation_type::bin_upper: { - bool upper = specs.type == presentation_type::bin_upper; - if (specs.alt) - prefix_append(prefix, unsigned(upper ? 'B' : 'b') << 8 | '0'); - num_digits = count_digits<1>(value); - format_uint<1, char>(appender(buffer), value, num_digits); - break; - } - case presentation_type::oct: { - num_digits = count_digits<3>(value); - // Octal prefix '0' is counted as a digit, so only add it if precision - // is not greater than the number of digits. - if (specs.alt && specs.precision <= num_digits && value != 0) - prefix_append(prefix, '0'); - format_uint<3, char>(appender(buffer), value, num_digits); - break; - } - case presentation_type::chr: - return write_char(out, static_cast(value), specs); - default: throw_format_error("invalid format specifier"); - } - - unsigned size = (prefix != 0 ? prefix >> 24 : 0) + to_unsigned(num_digits) - + to_unsigned(grouping.count_separators(num_digits)); - return write_padded( - out, specs, size, size, [&](reserve_iterator it) { - for (unsigned p = prefix & 0xffffff; p != 0; p >>= 8) - *it++ = static_cast(p & 0xff); - return grouping.apply( - it, string_view(buffer.data(), buffer.size())); - }); -} - -// Writes a localized value. 
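write_int above carries the sign and radix prefix in a single unsigned: up to three characters packed into the low bytes, emitted least significant byte first, with the character count kept in the top byte. prefix_append maintains that encoding; a standalone sketch of how the pieces compose (write_loc follows below):

#include <cstdio>

// Same packing as above: chars in the three low bytes, count in byte 3.
static void prefix_append(unsigned &prefix, unsigned value) {
    prefix |= prefix != 0 ? value << 8 : value;
    prefix += (1u + (value > 0xff ? 1 : 0)) << 24;
}

static void print_prefix(unsigned prefix) {
    std::printf("size=%u chars=\"", prefix >> 24);
    for (unsigned p = prefix & 0xffffff; p != 0; p >>= 8) // low byte first
        std::putchar(int(p & 0xff));
    std::puts("\"");
}

int main() {
    unsigned prefix = 0x01000000 | '-';              // sign, one char
    prefix_append(prefix, unsigned('x') << 8 | '0'); // then "0x" for '#x'
    print_prefix(prefix);                            // size=3 chars="-0x"
}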
-FMT_API auto write_loc(appender out, loc_value value, - const format_specs<> &specs, locale_ref loc) -> bool; -template -inline auto write_loc( - OutputIt, loc_value, const format_specs &, locale_ref) -> bool { - return false; -} - -template -struct write_int_arg { - UInt abs_value; - unsigned prefix; -}; - -template -FMT_CONSTEXPR auto make_write_int_arg(T value, sign_t sign) - -> write_int_arg> { - auto prefix = 0u; - auto abs_value = static_cast>(value); - if (is_negative(value)) { - prefix = 0x01000000 | '-'; - abs_value = 0 - abs_value; - } else { - constexpr const unsigned prefixes[4] - = {0, 0, 0x1000000u | '+', 0x1000000u | ' '}; - prefix = prefixes[sign]; - } - return {abs_value, prefix}; -} - -template -struct loc_writer { - buffer_appender out; - const format_specs &specs; - std::basic_string sep; - std::string grouping; - std::basic_string decimal_point; - - template ::value)> - auto operator()(T value) -> bool { - auto arg = make_write_int_arg(value, specs.sign); - write_int(out, static_cast>(arg.abs_value), - arg.prefix, specs, digit_grouping(grouping, sep)); - return true; - } - - template ::value)> - auto operator()(T) -> bool { - return false; - } -}; - -template -FMT_CONSTEXPR FMT_INLINE auto write_int(OutputIt out, write_int_arg arg, - const format_specs &specs, locale_ref) -> OutputIt { - static_assert(std::is_same>::value, ""); - auto abs_value = arg.abs_value; - auto prefix = arg.prefix; - switch (specs.type) { - case presentation_type::none: - case presentation_type::dec: { - auto num_digits = count_digits(abs_value); - return write_int(out, num_digits, prefix, specs, - [=](reserve_iterator it) { - return format_decimal(it, abs_value, num_digits) - .end; - }); - } - case presentation_type::hex_lower: - case presentation_type::hex_upper: { - bool upper = specs.type == presentation_type::hex_upper; - if (specs.alt) - prefix_append(prefix, unsigned(upper ? 'X' : 'x') << 8 | '0'); - int num_digits = count_digits<4>(abs_value); - return write_int(out, num_digits, prefix, specs, - [=](reserve_iterator it) { - return format_uint<4, Char>( - it, abs_value, num_digits, upper); - }); - } - case presentation_type::bin_lower: - case presentation_type::bin_upper: { - bool upper = specs.type == presentation_type::bin_upper; - if (specs.alt) - prefix_append(prefix, unsigned(upper ? 'B' : 'b') << 8 | '0'); - int num_digits = count_digits<1>(abs_value); - return write_int(out, num_digits, prefix, specs, - [=](reserve_iterator it) { - return format_uint<1, Char>(it, abs_value, num_digits); - }); - } - case presentation_type::oct: { - int num_digits = count_digits<3>(abs_value); - // Octal prefix '0' is counted as a digit, so only add it if precision - // is not greater than the number of digits. 
- if (specs.alt && specs.precision <= num_digits && abs_value != 0) - prefix_append(prefix, '0'); - return write_int(out, num_digits, prefix, specs, - [=](reserve_iterator it) { - return format_uint<3, Char>(it, abs_value, num_digits); - }); - } - case presentation_type::chr: - return write_char(out, static_cast(abs_value), specs); - default: throw_format_error("invalid format specifier"); - } - return out; -} -template -FMT_CONSTEXPR FMT_NOINLINE auto write_int_noinline(OutputIt out, - write_int_arg arg, const format_specs &specs, locale_ref loc) - -> OutputIt { - return write_int(out, arg, specs, loc); -} -template ::value && !std::is_same::value - && std::is_same>::value)> -FMT_CONSTEXPR FMT_INLINE auto write(OutputIt out, T value, - const format_specs &specs, locale_ref loc) -> OutputIt { - if (specs.localized && write_loc(out, value, specs, loc)) return out; - return write_int_noinline( - out, make_write_int_arg(value, specs.sign), specs, loc); -} -// An inlined version of write used in format string compilation. -template ::value && !std::is_same::value - && !std::is_same>::value)> -FMT_CONSTEXPR FMT_INLINE auto write(OutputIt out, T value, - const format_specs &specs, locale_ref loc) -> OutputIt { - if (specs.localized && write_loc(out, value, specs, loc)) return out; - return write_int(out, make_write_int_arg(value, specs.sign), specs, loc); -} - -// An output iterator that counts the number of objects written to it and -// discards them. -class counting_iterator { -private: - size_t count_; - -public: - using iterator_category = std::output_iterator_tag; - using difference_type = std::ptrdiff_t; - using pointer = void; - using reference = void; - FMT_UNCHECKED_ITERATOR(counting_iterator); - - struct value_type { - template - FMT_CONSTEXPR void operator=(const T &) {} - }; - - FMT_CONSTEXPR counting_iterator() : count_(0) {} - - FMT_CONSTEXPR auto count() const -> size_t { return count_; } - - FMT_CONSTEXPR auto operator++() -> counting_iterator & { - ++count_; - return *this; - } - FMT_CONSTEXPR auto operator++(int) -> counting_iterator { - auto it = *this; - ++*this; - return it; - } - - FMT_CONSTEXPR friend auto operator+(counting_iterator it, difference_type n) - -> counting_iterator { - it.count_ += static_cast(n); - return it; - } - - FMT_CONSTEXPR auto operator*() const -> value_type { return {}; } -}; - -template -FMT_CONSTEXPR auto write(OutputIt out, basic_string_view s, - const format_specs &specs) -> OutputIt { - auto data = s.data(); - auto size = s.size(); - if (specs.precision >= 0 && to_unsigned(specs.precision) < size) - size = code_point_index(s, to_unsigned(specs.precision)); - bool is_debug = specs.type == presentation_type::debug; - size_t width = 0; - if (specs.width != 0) { - if (is_debug) - width = write_escaped_string(counting_iterator {}, s).count(); - else - width = compute_width(basic_string_view(data, size)); - } - return write_padded( - out, specs, size, width, [=](reserve_iterator it) { - if (is_debug) return write_escaped_string(it, s); - return copy_str(data, data + size, it); - }); -} -template -FMT_CONSTEXPR auto write(OutputIt out, - basic_string_view> s, - const format_specs &specs, locale_ref) -> OutputIt { - return write(out, s, specs); -} -template -FMT_CONSTEXPR auto write(OutputIt out, const Char *s, - const format_specs &specs, locale_ref) -> OutputIt { - if (specs.type == presentation_type::pointer) - return write_ptr(out, bit_cast(s), &specs); - if (!s) throw_format_error("string pointer is null"); - return write(out, 
basic_string_view(s), specs, {}); -} - -template ::value && !std::is_same::value - && !std::is_same::value)> -FMT_CONSTEXPR auto write(OutputIt out, T value) -> OutputIt { - auto abs_value = static_cast>(value); - bool negative = is_negative(value); - // Don't do -abs_value since it trips unsigned-integer-overflow sanitizer. - if (negative) abs_value = ~abs_value + 1; - int num_digits = count_digits(abs_value); - auto size = (negative ? 1 : 0) + static_cast(num_digits); - auto it = reserve(out, size); - if (auto ptr = to_pointer(it, size)) { - if (negative) *ptr++ = static_cast('-'); - format_decimal(ptr, abs_value, num_digits); - return out; - } - if (negative) *it++ = static_cast('-'); - it = format_decimal(it, abs_value, num_digits).end; - return base_iterator(out, it); -} - -// DEPRECATED! -template -FMT_CONSTEXPR auto parse_align(const Char *begin, const Char *end, - format_specs &specs) -> const Char * { - FMT_ASSERT(begin != end, ""); - auto align = align::none; - auto p = begin + code_point_length(begin); - if (end - p <= 0) p = begin; - for (;;) { - switch (to_ascii(*p)) { - case '<': align = align::left; break; - case '>': align = align::right; break; - case '^': align = align::center; break; - } - if (align != align::none) { - if (p != begin) { - auto c = *begin; - if (c == '}') return begin; - if (c == '{') { - throw_format_error("invalid fill character '{'"); - return begin; - } - specs.fill = {begin, to_unsigned(p - begin)}; - begin = p + 1; - } else { - ++begin; - } - break; - } else if (p == begin) { - break; - } - p = begin; - } - specs.align = align; - return begin; -} - -// A floating-point presentation format. -enum class float_format : unsigned char { - general, // General: exponent notation or fixed point based on magnitude. - exp, // Exponent notation with the default precision of 6, e.g. 1.2e-3. - fixed, // Fixed point with the default precision of 6, e.g. 0.0012. - hex -}; - -struct float_specs { - int precision; - float_format format : 8; - sign_t sign : 8; - bool upper : 1; - bool locale : 1; - bool binary32 : 1; - bool showpoint : 1; -}; - -template -FMT_CONSTEXPR auto parse_float_type_spec(const format_specs &specs) - -> float_specs { - auto result = float_specs(); - result.showpoint = specs.alt; - result.locale = specs.localized; - switch (specs.type) { - case presentation_type::none: - result.format = float_format::general; - break; - case presentation_type::general_upper: - result.upper = true; - FMT_FALLTHROUGH; - case presentation_type::general_lower: - result.format = float_format::general; - break; - case presentation_type::exp_upper: result.upper = true; FMT_FALLTHROUGH; - case presentation_type::exp_lower: - result.format = float_format::exp; - result.showpoint |= specs.precision != 0; - break; - case presentation_type::fixed_upper: - result.upper = true; - FMT_FALLTHROUGH; - case presentation_type::fixed_lower: - result.format = float_format::fixed; - result.showpoint |= specs.precision != 0; - break; - case presentation_type::hexfloat_upper: - result.upper = true; - FMT_FALLTHROUGH; - case presentation_type::hexfloat_lower: - result.format = float_format::hex; - break; - default: throw_format_error("invalid format specifier"); break; - } - return result; -} - -template -FMT_CONSTEXPR20 auto write_nonfinite(OutputIt out, bool isnan, - format_specs specs, const float_specs &fspecs) -> OutputIt { - auto str = isnan ? (fspecs.upper ? "NAN" : "nan") - : (fspecs.upper ? 
"INF" : "inf"); - constexpr size_t str_size = 3; - auto sign = fspecs.sign; - auto size = str_size + (sign ? 1 : 0); - // Replace '0'-padding with space for non-finite values. - const bool is_zero_fill = specs.fill.size() == 1 - && *specs.fill.data() == static_cast('0'); - if (is_zero_fill) specs.fill[0] = static_cast(' '); - return write_padded(out, specs, size, [=](reserve_iterator it) { - if (sign) *it++ = detail::sign(sign); - return copy_str(str, str + str_size, it); - }); -} - -// A decimal floating-point number significand * pow(10, exp). -struct big_decimal_fp { - const char *significand; - int significand_size; - int exponent; -}; - -constexpr auto get_significand_size(const big_decimal_fp &f) -> int { - return f.significand_size; -} -template -inline auto get_significand_size(const dragonbox::decimal_fp &f) -> int { - return count_digits(f.significand); -} - -template -constexpr auto write_significand(OutputIt out, const char *significand, - int significand_size) -> OutputIt { - return copy_str(significand, significand + significand_size, out); -} -template -inline auto write_significand( - OutputIt out, UInt significand, int significand_size) -> OutputIt { - return format_decimal(out, significand, significand_size).end; -} -template -FMT_CONSTEXPR20 auto write_significand(OutputIt out, T significand, - int significand_size, int exponent, const Grouping &grouping) - -> OutputIt { - if (!grouping.has_separator()) { - out = write_significand(out, significand, significand_size); - return detail::fill_n(out, exponent, static_cast('0')); - } - auto buffer = memory_buffer(); - write_significand(appender(buffer), significand, significand_size); - detail::fill_n(appender(buffer), exponent, '0'); - return grouping.apply(out, string_view(buffer.data(), buffer.size())); -} - -template ::value)> -inline auto write_significand(Char *out, UInt significand, int significand_size, - int integral_size, Char decimal_point) -> Char * { - if (!decimal_point) - return format_decimal(out, significand, significand_size).end; - out += significand_size + 1; - Char *end = out; - int floating_size = significand_size - integral_size; - for (int i = floating_size / 2; i > 0; --i) { - out -= 2; - copy2(out, digits2(static_cast(significand % 100))); - significand /= 100; - } - if (floating_size % 2 != 0) { - *--out = static_cast('0' + significand % 10); - significand /= 10; - } - *--out = decimal_point; - format_decimal(out - integral_size, significand, integral_size); - return end; -} - -template >::value)> -inline auto write_significand(OutputIt out, UInt significand, - int significand_size, int integral_size, Char decimal_point) - -> OutputIt { - // Buffer is large enough to hold digits (digits10 + 1) and a decimal point. 
- Char buffer[digits10() + 2]; - auto end = write_significand(buffer, significand, significand_size, - integral_size, decimal_point); - return detail::copy_str_noinline(buffer, end, out); -} - -template -FMT_CONSTEXPR auto write_significand(OutputIt out, const char *significand, - int significand_size, int integral_size, Char decimal_point) - -> OutputIt { - out = detail::copy_str_noinline( - significand, significand + integral_size, out); - if (!decimal_point) return out; - *out++ = decimal_point; - return detail::copy_str_noinline( - significand + integral_size, significand + significand_size, out); -} - -template -FMT_CONSTEXPR20 auto write_significand(OutputIt out, T significand, - int significand_size, int integral_size, Char decimal_point, - const Grouping &grouping) -> OutputIt { - if (!grouping.has_separator()) { - return write_significand(out, significand, significand_size, - integral_size, decimal_point); - } - auto buffer = basic_memory_buffer(); - write_significand(buffer_appender(buffer), significand, - significand_size, integral_size, decimal_point); - grouping.apply(out, - basic_string_view(buffer.data(), to_unsigned(integral_size))); - return detail::copy_str_noinline( - buffer.data() + integral_size, buffer.end(), out); -} - -template > -FMT_CONSTEXPR20 auto do_write_float(OutputIt out, const DecimalFP &f, - const format_specs &specs, float_specs fspecs, locale_ref loc) - -> OutputIt { - auto significand = f.significand; - int significand_size = get_significand_size(f); - const Char zero = static_cast('0'); - auto sign = fspecs.sign; - size_t size = to_unsigned(significand_size) + (sign ? 1 : 0); - using iterator = reserve_iterator; - - Char decimal_point = fspecs.locale ? detail::decimal_point(loc) - : static_cast('.'); - - int output_exp = f.exponent + significand_size - 1; - auto use_exp_format = [=]() { - if (fspecs.format == float_format::exp) return true; - if (fspecs.format != float_format::general) return false; - // Use the fixed notation if the exponent is in [exp_lower, exp_upper), - // e.g. 0.0001 instead of 1e-04. Otherwise use the exponent notation. - const int exp_lower = -4, exp_upper = 16; - return output_exp < exp_lower - || output_exp - >= (fspecs.precision > 0 ? fspecs.precision : exp_upper); - }; - if (use_exp_format()) { - int num_zeros = 0; - if (fspecs.showpoint) { - num_zeros = fspecs.precision - significand_size; - if (num_zeros < 0) num_zeros = 0; - size += to_unsigned(num_zeros); - } else if (significand_size == 1) { - decimal_point = Char(); - } - auto abs_output_exp = output_exp >= 0 ? output_exp : -output_exp; - int exp_digits = 2; - if (abs_output_exp >= 100) exp_digits = abs_output_exp >= 1000 ? 4 : 3; - - size += to_unsigned((decimal_point ? 1 : 0) + 2 + exp_digits); - char exp_char = fspecs.upper ? 'E' : 'e'; - auto write = [=](iterator it) { - if (sign) *it++ = detail::sign(sign); - // Insert a decimal point after the first digit and add an exponent. - it = write_significand( - it, significand, significand_size, 1, decimal_point); - if (num_zeros > 0) it = detail::fill_n(it, num_zeros, zero); - *it++ = static_cast(exp_char); - return write_exponent(output_exp, it); - }; - return specs.width > 0 - ? 
write_padded(out, specs, size, write) - : base_iterator(out, write(reserve(out, size))); - } - - int exp = f.exponent + significand_size; - if (f.exponent >= 0) { - // 1234e5 -> 123400000[.0+] - size += to_unsigned(f.exponent); - int num_zeros = fspecs.precision - exp; - abort_fuzzing_if(num_zeros > 5000); - if (fspecs.showpoint) { - ++size; - if (num_zeros <= 0 && fspecs.format != float_format::fixed) - num_zeros = 0; - if (num_zeros > 0) size += to_unsigned(num_zeros); - } - auto grouping = Grouping(loc, fspecs.locale); - size += to_unsigned(grouping.count_separators(exp)); - return write_padded(out, specs, size, [&](iterator it) { - if (sign) *it++ = detail::sign(sign); - it = write_significand( - it, significand, significand_size, f.exponent, grouping); - if (!fspecs.showpoint) return it; - *it++ = decimal_point; - return num_zeros > 0 ? detail::fill_n(it, num_zeros, zero) : it; - }); - } else if (exp > 0) { - // 1234e-2 -> 12.34[0+] - int num_zeros - = fspecs.showpoint ? fspecs.precision - significand_size : 0; - size += 1 + to_unsigned(num_zeros > 0 ? num_zeros : 0); - auto grouping = Grouping(loc, fspecs.locale); - size += to_unsigned(grouping.count_separators(exp)); - return write_padded(out, specs, size, [&](iterator it) { - if (sign) *it++ = detail::sign(sign); - it = write_significand(it, significand, significand_size, exp, - decimal_point, grouping); - return num_zeros > 0 ? detail::fill_n(it, num_zeros, zero) : it; - }); - } - // 1234e-6 -> 0.001234 - int num_zeros = -exp; - if (significand_size == 0 && fspecs.precision >= 0 - && fspecs.precision < num_zeros) { - num_zeros = fspecs.precision; - } - bool pointy = num_zeros != 0 || significand_size != 0 || fspecs.showpoint; - size += 1 + (pointy ? 1 : 0) + to_unsigned(num_zeros); - return write_padded(out, specs, size, [&](iterator it) { - if (sign) *it++ = detail::sign(sign); - *it++ = zero; - if (!pointy) return it; - *it++ = decimal_point; - it = detail::fill_n(it, num_zeros, zero); - return write_significand(it, significand, significand_size); - }); -} - -template -class fallback_digit_grouping { -public: - constexpr fallback_digit_grouping(locale_ref, bool) {} - - constexpr auto has_separator() const -> bool { return false; } - - constexpr auto count_separators(int) const -> int { return 0; } - - template - constexpr auto apply(Out out, basic_string_view) const -> Out { - return out; - } -}; - -template -FMT_CONSTEXPR20 auto write_float(OutputIt out, const DecimalFP &f, - const format_specs &specs, float_specs fspecs, locale_ref loc) - -> OutputIt { - if (is_constant_evaluated()) { - return do_write_float>(out, f, specs, fspecs, loc); - } else { - return do_write_float(out, f, specs, fspecs, loc); - } -} - -template -constexpr auto isnan(T value) -> bool { - return !(value >= value); // std::isnan doesn't support __float128. -} - -template -struct has_isfinite : std::false_type {}; - -template -struct has_isfinite> - : std::true_type {}; - -template ::value &&has_isfinite::value)> -FMT_CONSTEXPR20 auto isfinite(T value) -> bool { - constexpr T inf = T(std::numeric_limits::infinity()); - if (is_constant_evaluated()) - return !detail::isnan(value) && value < inf && value > -inf; - return std::isfinite(value); -} -template ::value)> -FMT_CONSTEXPR auto isfinite(T value) -> bool { - T inf = T(std::numeric_limits::infinity()); - // std::isfinite doesn't support __float128. 
-    return !detail::isnan(value) && value < inf && value > -inf;
-}
-
-template <typename T, FMT_ENABLE_IF(is_floating_point<T>::value)>
-FMT_INLINE FMT_CONSTEXPR bool signbit(T value) {
-    if (is_constant_evaluated()) {
-#ifdef __cpp_if_constexpr
-        if constexpr (std::numeric_limits<double>::is_iec559) {
-            auto bits = detail::bit_cast<uint64_t>(static_cast<double>(value));
-            return (bits >> (num_bits<uint64_t>() - 1)) != 0;
-        }
-#endif
-    }
-    return std::signbit(static_cast<double>(value));
-}
-
-inline FMT_CONSTEXPR20 void adjust_precision(int &precision, int exp10) {
-    // Adjust fixed precision by exponent because it is relative to decimal
-    // point.
-    if (exp10 > 0 && precision > max_value<int>() - exp10)
-        FMT_THROW(format_error("number is too big"));
-    precision += exp10;
-}
-
-class bigint {
-private:
-    // A bigint is stored as an array of bigits (big digits), with bigit at index
-    // 0 being the least significant one.
-    using bigit = uint32_t;
-    using double_bigit = uint64_t;
-    enum { bigits_capacity = 32 };
-    basic_memory_buffer<bigit, bigits_capacity> bigits_;
-    int exp_;
-
-    FMT_CONSTEXPR20 auto operator[](int index) const -> bigit {
-        return bigits_[to_unsigned(index)];
-    }
-    FMT_CONSTEXPR20 auto operator[](int index) -> bigit & {
-        return bigits_[to_unsigned(index)];
-    }
-
-    static constexpr const int bigit_bits = num_bits<bigit>();
-
-    friend struct formatter<bigint>;
-
-    FMT_CONSTEXPR20 void subtract_bigits(
-            int index, bigit other, bigit &borrow) {
-        auto result
-                = static_cast<double_bigit>((*this)[index]) - other - borrow;
-        (*this)[index] = static_cast<bigit>(result);
-        borrow = static_cast<bigit>(result >> (bigit_bits * 2 - 1));
-    }
-
-    FMT_CONSTEXPR20 void remove_leading_zeros() {
-        int num_bigits = static_cast<int>(bigits_.size()) - 1;
-        while (num_bigits > 0 && (*this)[num_bigits] == 0)
-            --num_bigits;
-        bigits_.resize(to_unsigned(num_bigits + 1));
-    }
-
-    // Computes *this -= other assuming aligned bigints and *this >= other.
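subtract_bigits above performs one limb of schoolbook subtraction: do the 32-bit difference in 64 bits and read the borrow out of the wide result's sign bit. subtract_aligned, defined next, simply runs that across the limbs. The same mechanic on plain vectors, as a standalone sketch:

#include <cstdint>
#include <cstdio>
#include <vector>

// Multiword a -= b (requires a >= b), least significant limb first.
static void subtract(std::vector<uint32_t> &a, const std::vector<uint32_t> &b) {
    uint32_t borrow = 0;
    for (size_t i = 0; i < a.size(); ++i) {
        uint64_t rhs = uint64_t(i < b.size() ? b[i] : 0) + borrow;
        uint64_t result = uint64_t(a[i]) - rhs;
        a[i] = uint32_t(result);
        borrow = uint32_t(result >> 63); // 1 iff the subtraction wrapped
    }
}

int main() {
    std::vector<uint32_t> a = {0x00000000, 0x00000001}; // 2^32
    std::vector<uint32_t> b = {0x00000001};             // 1
    subtract(a, b);
    std::printf("%08x %08x\n", a[1], a[0]); // 00000000 ffffffff == 2^32 - 1
}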
- FMT_CONSTEXPR20 void subtract_aligned(const bigint &other) { - FMT_ASSERT(other.exp_ >= exp_, "unaligned bigints"); - FMT_ASSERT(fmt_compare(*this, other) >= 0, ""); - bigit borrow = 0; - int i = other.exp_ - exp_; - for (size_t j = 0, n = other.bigits_.size(); j != n; ++i, ++j) - subtract_bigits(i, other.bigits_[j], borrow); - while (borrow > 0) - subtract_bigits(i, 0, borrow); - remove_leading_zeros(); - } - - FMT_CONSTEXPR20 void multiply(uint32_t value) { - const double_bigit wide_value = value; - bigit carry = 0; - for (size_t i = 0, n = bigits_.size(); i < n; ++i) { - double_bigit result = bigits_[i] * wide_value + carry; - bigits_[i] = static_cast(result); - carry = static_cast(result >> bigit_bits); - } - if (carry != 0) bigits_.push_back(carry); - } - - template ::value - || std::is_same::value)> - FMT_CONSTEXPR20 void multiply(UInt value) { - using half_uint = conditional_t::value, - uint64_t, uint32_t>; - const int shift = num_bits() - bigit_bits; - const UInt lower = static_cast(value); - const UInt upper = value >> num_bits(); - UInt carry = 0; - for (size_t i = 0, n = bigits_.size(); i < n; ++i) { - UInt result = lower * bigits_[i] + static_cast(carry); - carry = (upper * bigits_[i] << shift) + (result >> bigit_bits) - + (carry >> bigit_bits); - bigits_[i] = static_cast(result); - } - while (carry != 0) { - bigits_.push_back(static_cast(carry)); - carry >>= bigit_bits; - } - } - - template ::value - || std::is_same::value)> - FMT_CONSTEXPR20 void assign(UInt n) { - size_t num_bigits = 0; - do { - bigits_[num_bigits++] = static_cast(n); - n >>= bigit_bits; - } while (n != 0); - bigits_.resize(num_bigits); - exp_ = 0; - } - -public: - FMT_CONSTEXPR20 bigint() : exp_(0) {} - explicit bigint(uint64_t n) { assign(n); } - - bigint(const bigint &) = delete; - void operator=(const bigint &) = delete; - - FMT_CONSTEXPR20 void assign(const bigint &other) { - auto size = other.bigits_.size(); - bigits_.resize(size); - auto data = other.bigits_.data(); - copy_str(data, data + size, bigits_.data()); - exp_ = other.exp_; - } - - template - FMT_CONSTEXPR20 void operator=(Int n) { - FMT_ASSERT(n > 0, ""); - assign(uint64_or_128_t(n)); - } - - FMT_CONSTEXPR20 auto num_bigits() const -> int { - return static_cast(bigits_.size()) + exp_; - } - - FMT_NOINLINE FMT_CONSTEXPR20 auto operator<<=(int shift) -> bigint & { - FMT_ASSERT(shift >= 0, ""); - exp_ += shift / bigit_bits; - shift %= bigit_bits; - if (shift == 0) return *this; - bigit carry = 0; - for (size_t i = 0, n = bigits_.size(); i < n; ++i) { - bigit c = bigits_[i] >> (bigit_bits - shift); - bigits_[i] = (bigits_[i] << shift) + carry; - carry = c; - } - if (carry != 0) bigits_.push_back(carry); - return *this; - } - - template - FMT_CONSTEXPR20 auto operator*=(Int value) -> bigint & { - FMT_ASSERT(value > 0, ""); - multiply(uint32_or_64_or_128_t(value)); - return *this; - } - - // updated from compare to fmt_compare to avoid build conflicts with oneDNN - // primitive descriptors. - friend FMT_CONSTEXPR20 auto fmt_compare( - const bigint &lhs, const bigint &rhs) -> int { - int num_lhs_bigits = lhs.num_bigits(), - num_rhs_bigits = rhs.num_bigits(); - if (num_lhs_bigits != num_rhs_bigits) - return num_lhs_bigits > num_rhs_bigits ? 1 : -1; - int i = static_cast(lhs.bigits_.size()) - 1; - int j = static_cast(rhs.bigits_.size()) - 1; - int end = i - j; - if (end < 0) end = 0; - for (; i >= end; --i, --j) { - bigit lhs_bigit = lhs[i], rhs_bigit = rhs[j]; - if (lhs_bigit != rhs_bigit) return lhs_bigit > rhs_bigit ? 
1 : -1; - } - if (i != j) return i > j ? 1 : -1; - return 0; - } - - // Returns fmt_compare(lhs1 + lhs2, rhs). - friend FMT_CONSTEXPR20 auto add_compare( - const bigint &lhs1, const bigint &lhs2, const bigint &rhs) -> int { - auto minimum = [](int a, int b) { return a < b ? a : b; }; - auto maximum = [](int a, int b) { return a > b ? a : b; }; - int max_lhs_bigits = maximum(lhs1.num_bigits(), lhs2.num_bigits()); - int num_rhs_bigits = rhs.num_bigits(); - if (max_lhs_bigits + 1 < num_rhs_bigits) return -1; - if (max_lhs_bigits > num_rhs_bigits) return 1; - auto get_bigit = [](const bigint &n, int i) -> bigit { - return i >= n.exp_ && i < n.num_bigits() ? n[i - n.exp_] : 0; - }; - double_bigit borrow = 0; - int min_exp = minimum(minimum(lhs1.exp_, lhs2.exp_), rhs.exp_); - for (int i = num_rhs_bigits - 1; i >= min_exp; --i) { - double_bigit sum = static_cast(get_bigit(lhs1, i)) - + get_bigit(lhs2, i); - bigit rhs_bigit = get_bigit(rhs, i); - if (sum > rhs_bigit + borrow) return 1; - borrow = rhs_bigit + borrow - sum; - if (borrow > 1) return -1; - borrow <<= bigit_bits; - } - return borrow != 0 ? -1 : 0; - } - - // Assigns pow(10, exp) to this bigint. - FMT_CONSTEXPR20 void assign_pow10(int exp) { - FMT_ASSERT(exp >= 0, ""); - if (exp == 0) return *this = 1; - // Find the top bit. - int bitmask = 1; - while (exp >= bitmask) - bitmask <<= 1; - bitmask >>= 1; - // pow(10, exp) = pow(5, exp) * pow(2, exp). First compute pow(5, exp) by - // repeated squaring and multiplication. - *this = 5; - bitmask >>= 1; - while (bitmask != 0) { - square(); - if ((exp & bitmask) != 0) *this *= 5; - bitmask >>= 1; - } - *this <<= exp; // Multiply by pow(2, exp) by shifting. - } - - FMT_CONSTEXPR20 void square() { - int num_bigits = static_cast(bigits_.size()); - int num_result_bigits = 2 * num_bigits; - basic_memory_buffer n(std::move(bigits_)); - bigits_.resize(to_unsigned(num_result_bigits)); - auto sum = uint128_t(); - for (int bigit_index = 0; bigit_index < num_bigits; ++bigit_index) { - // Compute bigit at position bigit_index of the result by adding - // cross-product terms n[i] * n[j] such that i + j == bigit_index. - for (int i = 0, j = bigit_index; j >= 0; ++i, --j) { - // Most terms are multiplied twice which can be optimized in the future. - sum += static_cast(n[i]) * n[j]; - } - (*this)[bigit_index] = static_cast(sum); - sum >>= num_bits(); // Compute the carry. - } - // Do the same for the top half. - for (int bigit_index = num_bigits; bigit_index < num_result_bigits; - ++bigit_index) { - for (int j = num_bigits - 1, i = bigit_index - j; i < num_bigits;) - sum += static_cast(n[i++]) * n[j--]; - (*this)[bigit_index] = static_cast(sum); - sum >>= num_bits(); - } - remove_leading_zeros(); - exp_ *= 2; - } - - // If this bigint has a bigger exponent than other, adds trailing zero to make - // exponents equal. This simplifies some operations such as subtraction. - FMT_CONSTEXPR20 void align(const bigint &other) { - int exp_difference = exp_ - other.exp_; - if (exp_difference <= 0) return; - int num_bigits = static_cast(bigits_.size()); - bigits_.resize(to_unsigned(num_bigits + exp_difference)); - for (int i = num_bigits - 1, j = i + exp_difference; i >= 0; --i, --j) - bigits_[j] = bigits_[i]; - std::uninitialized_fill_n(bigits_.data(), exp_difference, 0u); - exp_ -= exp_difference; - } - - // Divides this bignum by divisor, assigning the remainder to this and - // returning the quotient. 
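assign_pow10 above factors 10^e as 5^e * 2^e: the 5^e part is built by binary exponentiation (square-and-multiply, from the most significant exponent bit down) and the 2^e part is one left shift, which is cheap on a bigint. The same scheme on plain uint64_t for exponents up to 19, as a standalone sketch (divmod_assign, whose comment appears just above, is defined next):

#include <cstdint>
#include <cstdio>

static uint64_t pow10(int e) { // valid for 0 <= e <= 19
    int bitmask = 1;
    while (e >= bitmask) // find the bit above e's top bit
        bitmask <<= 1;
    bitmask >>= 1;
    uint64_t r = e == 0 ? 1 : 5;
    for (bitmask >>= 1; bitmask != 0; bitmask >>= 1) {
        r *= r;                  // square
        if (e & bitmask) r *= 5; // multiply
    }
    return r << e; // * 2^e
}

int main() {
    for (int e = 0; e <= 19; ++e)
        std::printf("10^%d = %llu\n", e, (unsigned long long)pow10(e));
}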
- FMT_CONSTEXPR20 auto divmod_assign(const bigint &divisor) -> int { - FMT_ASSERT(this != &divisor, ""); - if (fmt_compare(*this, divisor) < 0) return 0; - FMT_ASSERT(divisor.bigits_[divisor.bigits_.size() - 1u] != 0, ""); - align(divisor); - int quotient = 0; - do { - subtract_aligned(divisor); - ++quotient; - } while (fmt_compare(*this, divisor) >= 0); - return quotient; - } -}; - -// format_dragon flags. -enum dragon { - predecessor_closer = 1, - fixup = 2, // Run fixup to correct exp10 which can be off by one. - fixed = 4, -}; - -// Formats a floating-point number using a variation of the Fixed-Precision -// Positive Floating-Point Printout ((FPP)^2) algorithm by Steele & White: -// https://fmt.dev/papers/p372-steele.pdf. -FMT_CONSTEXPR20 inline void format_dragon(basic_fp value, - unsigned flags, int num_digits, buffer &buf, int &exp10) { - bigint numerator; // 2 * R in (FPP)^2. - bigint denominator; // 2 * S in (FPP)^2. - // lower and upper are differences between value and corresponding boundaries. - bigint lower; // (M^- in (FPP)^2). - bigint upper_store; // upper's value if different from lower. - bigint *upper = nullptr; // (M^+ in (FPP)^2). - // Shift numerator and denominator by an extra bit or two (if lower boundary - // is closer) to make lower and upper integers. This eliminates multiplication - // by 2 during later computations. - bool is_predecessor_closer = (flags & dragon::predecessor_closer) != 0; - int shift = is_predecessor_closer ? 2 : 1; - if (value.e >= 0) { - numerator = value.f; - numerator <<= value.e + shift; - lower = 1; - lower <<= value.e; - if (is_predecessor_closer) { - upper_store = 1; - upper_store <<= value.e + 1; - upper = &upper_store; - } - denominator.assign_pow10(exp10); - denominator <<= shift; - } else if (exp10 < 0) { - numerator.assign_pow10(-exp10); - lower.assign(numerator); - if (is_predecessor_closer) { - upper_store.assign(numerator); - upper_store <<= 1; - upper = &upper_store; - } - numerator *= value.f; - numerator <<= shift; - denominator = 1; - denominator <<= shift - value.e; - } else { - numerator = value.f; - numerator <<= shift; - denominator.assign_pow10(exp10); - denominator <<= shift - value.e; - lower = 1; - if (is_predecessor_closer) { - upper_store = 1ULL << 1; - upper = &upper_store; - } - } - int even = static_cast((value.f & 1) == 0); - if (!upper) upper = &lower; - bool shortest = num_digits < 0; - if ((flags & dragon::fixup) != 0) { - if (add_compare(numerator, *upper, denominator) + even <= 0) { - --exp10; - numerator *= 10; - if (num_digits < 0) { - lower *= 10; - if (upper != &lower) *upper *= 10; - } - } - if ((flags & dragon::fixed) != 0) - adjust_precision(num_digits, exp10 + 1); - } - // Invariant: value == (numerator / denominator) * pow(10, exp10). - if (shortest) { - // Generate the shortest representation. - num_digits = 0; - char *data = buf.data(); - for (;;) { - int digit = numerator.divmod_assign(denominator); - bool low = fmt_compare(numerator, lower) - even - < 0; // numerator <[=] lower. - // numerator + upper >[=] pow10: - bool high = add_compare(numerator, *upper, denominator) + even > 0; - data[num_digits++] = static_cast('0' + digit); - if (low || high) { - if (!low) { - ++data[num_digits - 1]; - } else if (high) { - int result = add_compare(numerator, numerator, denominator); - // Round half to even. 
- if (result > 0 || (result == 0 && (digit % 2) != 0)) - ++data[num_digits - 1]; - } - buf.try_resize(to_unsigned(num_digits)); - exp10 -= num_digits - 1; - return; - } - numerator *= 10; - lower *= 10; - if (upper != &lower) *upper *= 10; - } - } - // Generate the given number of digits. - exp10 -= num_digits - 1; - if (num_digits <= 0) { - denominator *= 10; - auto digit = add_compare(numerator, numerator, denominator) > 0 ? '1' - : '0'; - buf.push_back(digit); - return; - } - buf.try_resize(to_unsigned(num_digits)); - for (int i = 0; i < num_digits - 1; ++i) { - int digit = numerator.divmod_assign(denominator); - buf[i] = static_cast('0' + digit); - numerator *= 10; - } - int digit = numerator.divmod_assign(denominator); - auto result = add_compare(numerator, numerator, denominator); - if (result > 0 || (result == 0 && (digit % 2) != 0)) { - if (digit == 9) { - const auto overflow = '0' + 10; - buf[num_digits - 1] = overflow; - // Propagate the carry. - for (int i = num_digits - 1; i > 0 && buf[i] == overflow; --i) { - buf[i] = '0'; - ++buf[i - 1]; - } - if (buf[0] == overflow) { - buf[0] = '1'; - if ((flags & dragon::fixed) != 0) - buf.push_back('0'); - else - ++exp10; - } - return; - } - ++digit; - } - buf[num_digits - 1] = static_cast('0' + digit); -} - -// Formats a floating-point number using the hexfloat format. -template ::value)> -FMT_CONSTEXPR20 void format_hexfloat( - Float value, int precision, float_specs specs, buffer &buf) { - // float is passed as double to reduce the number of instantiations and to - // simplify implementation. - static_assert(!std::is_same::value, ""); - - using info = dragonbox::float_info; - - // Assume Float is in the format [sign][exponent][significand]. - using carrier_uint = typename info::carrier_uint; - - constexpr auto num_float_significand_bits - = detail::num_significand_bits(); - - basic_fp f(value); - f.e += num_float_significand_bits; - if (!has_implicit_bit()) --f.e; - - constexpr auto num_fraction_bits - = num_float_significand_bits + (has_implicit_bit() ? 1 : 0); - constexpr auto num_xdigits = (num_fraction_bits + 3) / 4; - - constexpr auto leading_shift = ((num_xdigits - 1) * 4); - const auto leading_mask = carrier_uint(0xF) << leading_shift; - const auto leading_xdigit - = static_cast((f.f & leading_mask) >> leading_shift); - if (leading_xdigit > 1) f.e -= (32 - countl_zero(leading_xdigit) - 1); - - int print_xdigits = num_xdigits - 1; - if (precision >= 0 && print_xdigits > precision) { - const int shift = ((print_xdigits - precision - 1) * 4); - const auto mask = carrier_uint(0xF) << shift; - const auto v = static_cast((f.f & mask) >> shift); - - if (v >= 8) { - const auto inc = carrier_uint(1) << (shift + 4); - f.f += inc; - f.f &= ~(inc - 1); - } - - // Check long double overflow - if (!has_implicit_bit()) { - const auto implicit_bit = carrier_uint(1) - << num_float_significand_bits; - if ((f.f & implicit_bit) == implicit_bit) { - f.f >>= 4; - f.e += 4; - } - } - - print_xdigits = precision; - } - - char xdigits[num_bits() / 4]; - detail::fill_n(xdigits, sizeof(xdigits), '0'); - format_uint<4>(xdigits, f.f, num_xdigits, specs.upper); - - // Remove zero tail - while (print_xdigits > 0 && xdigits[print_xdigits] == '0') - --print_xdigits; - - buf.push_back('0'); - buf.push_back(specs.upper ? 
'X' : 'x'); - buf.push_back(xdigits[0]); - if (specs.showpoint || print_xdigits > 0 || print_xdigits < precision) - buf.push_back('.'); - buf.append(xdigits + 1, xdigits + 1 + print_xdigits); - for (; print_xdigits < precision; ++print_xdigits) - buf.push_back('0'); - - buf.push_back(specs.upper ? 'P' : 'p'); - - uint32_t abs_e; - if (f.e < 0) { - buf.push_back('-'); - abs_e = static_cast(-f.e); - } else { - buf.push_back('+'); - abs_e = static_cast(f.e); - } - format_decimal(appender(buf), abs_e, detail::count_digits(abs_e)); -} - -template ::value)> -FMT_CONSTEXPR20 void format_hexfloat( - Float value, int precision, float_specs specs, buffer &buf) { - format_hexfloat(static_cast(value), precision, specs, buf); -} - -constexpr auto fractional_part_rounding_thresholds(int index) -> uint32_t { - // For checking rounding thresholds. - // The kth entry is chosen to be the smallest integer such that the - // upper 32-bits of 10^(k+1) times it is strictly bigger than 5 * 10^k. - // It is equal to ceil(2^31 + 2^32/10^(k + 1)). - // These are stored in a string literal because we cannot have static arrays - // in constexpr functions and non-static ones are poorly optimized. - return U"\x9999999a\x828f5c29\x80418938\x80068db9\x8000a7c6\x800010c7" - U"\x800001ae\x8000002b"[index]; -} - -template -FMT_CONSTEXPR20 auto format_float(Float value, int precision, float_specs specs, - buffer &buf) -> int { - // float is passed as double to reduce the number of instantiations. - static_assert(!std::is_same::value, ""); - FMT_ASSERT(value >= 0, "value is negative"); - auto converted_value = convert_float(value); - - const bool fixed = specs.format == float_format::fixed; - if (value <= 0) { // <= instead of == to silence a warning. - if (precision <= 0 || !fixed) { - buf.push_back('0'); - return 0; - } - buf.try_resize(to_unsigned(precision)); - fill_n(buf.data(), precision, '0'); - return -precision; - } - - int exp = 0; - bool use_dragon = true; - unsigned dragon_flags = 0; - if (!is_fast_float() || is_constant_evaluated()) { - const auto inv_log2_10 = 0.3010299956639812; // 1 / log2(10) - using info = dragonbox::float_info; - const auto f = basic_fp(converted_value); - // Compute exp, an approximate power of 10, such that - // 10^(exp - 1) <= value < 10^exp or 10^exp <= value < 10^(exp + 1). - // This is based on log10(value) == log2(value) / log2(10) and approximation - // of log2(value) by e + num_fraction_bits idea from double-conversion. - auto e = (f.e + count_digits<1>(f.f) - 1) * inv_log2_10 - 1e-10; - exp = static_cast(e); - if (e > exp) ++exp; // Compute ceil. - dragon_flags = dragon::fixup; - } else if (precision < 0) { - // Use Dragonbox for the shortest format. - if (specs.binary32) { - auto dec = dragonbox::to_decimal(static_cast(value)); - write(buffer_appender(buf), dec.significand); - return dec.exponent; - } - auto dec = dragonbox::to_decimal(static_cast(value)); - write(buffer_appender(buf), dec.significand); - return dec.exponent; - } else { - // Extract significand bits and exponent bits. - using info = dragonbox::float_info; - auto br = bit_cast(static_cast(value)); - - const uint64_t significand_mask - = (static_cast(1) << num_significand_bits()) - - 1; - uint64_t significand = (br & significand_mask); - int exponent = static_cast((br & exponent_mask()) - >> num_significand_bits()); - - if (exponent != 0) { // Check if normal. 
- exponent - -= exponent_bias() + num_significand_bits(); - significand |= (static_cast(1) - << num_significand_bits()); - significand <<= 1; - } else { - // Normalize subnormal inputs. - FMT_ASSERT(significand != 0, "zeros should not appear here"); - int shift = countl_zero(significand); - FMT_ASSERT(shift >= num_bits() - - num_significand_bits(), - ""); - shift -= (num_bits() - num_significand_bits() - - 2); - exponent = (std::numeric_limits::min_exponent - - num_significand_bits()) - - shift; - significand <<= shift; - } - - // Compute the first several nonzero decimal significand digits. - // We call the number we get the first segment. - const int k = info::kappa - dragonbox::floor_log10_pow2(exponent); - exp = -k; - const int beta = exponent + dragonbox::floor_log2_pow10(k); - uint64_t first_segment; - bool has_more_segments; - int digits_in_the_first_segment; - { - const auto r = dragonbox::umul192_upper128( - significand << beta, dragonbox::get_cached_power(k)); - first_segment = r.high(); - has_more_segments = r.low() != 0; - - // The first segment can have 18 ~ 19 digits. - if (first_segment >= 1000000000000000000ULL) { - digits_in_the_first_segment = 19; - } else { - // When it is of 18-digits, we align it to 19-digits by adding a bogus - // zero at the end. - digits_in_the_first_segment = 18; - first_segment *= 10; - } - } - - // Compute the actual number of decimal digits to print. - if (fixed) - adjust_precision(precision, exp + digits_in_the_first_segment); - - // Use Dragon4 only when there might be not enough digits in the first - // segment. - if (digits_in_the_first_segment > precision) { - use_dragon = false; - - if (precision <= 0) { - exp += digits_in_the_first_segment; - - if (precision < 0) { - // Nothing to do, since all we have are just leading zeros. - buf.try_resize(0); - } else { - // We may need to round-up. - buf.try_resize(1); - if ((first_segment - | static_cast(has_more_segments)) - > 5000000000000000000ULL) { - buf[0] = '1'; - } else { - buf[0] = '0'; - } - } - } // precision <= 0 - else { - exp += digits_in_the_first_segment - precision; - - // When precision > 0, we divide the first segment into three - // subsegments, each with 9, 9, and 0 ~ 1 digits so that each fits - // in 32-bits which usually allows faster calculation than in - // 64-bits. Since some compiler (e.g. MSVC) doesn't know how to optimize - // division-by-constant for large 64-bit divisors, we do it here - // manually. The magic number 7922816251426433760 below is equal to - // ceil(2^(64+32) / 10^10). - const uint32_t first_subsegment = static_cast( - dragonbox::umul128_upper64( - first_segment, 7922816251426433760ULL) - >> 32); - const uint64_t second_third_subsegments - = first_segment - first_subsegment * 10000000000ULL; - - uint64_t prod; - uint32_t digits; - bool should_round_up; - int number_of_digits_to_print = precision > 9 ? 9 : precision; - - // Print a 9-digits subsegment, either the first or the second. - auto print_subsegment = [&](uint32_t subsegment, char *buffer) { - int number_of_digits_printed = 0; - - // If we want to print an odd number of digits from the subsegment, - if ((number_of_digits_to_print & 1) != 0) { - // Convert to 64-bit fixed-point fractional form with 1-digit - // integer part. The magic number 720575941 is a good enough - // approximation of 2^(32 + 24) / 10^8; see - // https://jk-jeon.github.io/posts/2022/12/fixed-precision-formatting/#fixed-length-case - // for details. 
- prod = ((subsegment * static_cast(720575941)) - >> 24) - + 1; - digits = static_cast(prod >> 32); - *buffer = static_cast('0' + digits); - number_of_digits_printed++; - } - // If we want to print an even number of digits from the - // first_subsegment, - else { - // Convert to 64-bit fixed-point fractional form with 2-digits - // integer part. The magic number 450359963 is a good enough - // approximation of 2^(32 + 20) / 10^7; see - // https://jk-jeon.github.io/posts/2022/12/fixed-precision-formatting/#fixed-length-case - // for details. - prod = ((subsegment * static_cast(450359963)) - >> 20) - + 1; - digits = static_cast(prod >> 32); - copy2(buffer, digits2(digits)); - number_of_digits_printed += 2; - } - - // Print all digit pairs. - while (number_of_digits_printed - < number_of_digits_to_print) { - prod = static_cast(prod) - * static_cast(100); - digits = static_cast(prod >> 32); - copy2(buffer + number_of_digits_printed, - digits2(digits)); - number_of_digits_printed += 2; - } - }; - - // Print first subsegment. - print_subsegment(first_subsegment, buf.data()); - - // Perform rounding if the first subsegment is the last subsegment to - // print. - if (precision <= 9) { - // Rounding inside the subsegment. - // We round-up if: - // - either the fractional part is strictly larger than 1/2, or - // - the fractional part is exactly 1/2 and the last digit is odd. - // We rely on the following observations: - // - If fractional_part >= threshold, then the fractional part is - // strictly larger than 1/2. - // - If the MSB of fractional_part is set, then the fractional part - // must be at least 1/2. - // - When the MSB of fractional_part is set, either - // second_third_subsegments being nonzero or has_more_segments - // being true means there are further digits not printed, so the - // fractional part is strictly larger than 1/2. - if (precision < 9) { - uint32_t fractional_part = static_cast(prod); - should_round_up = fractional_part - >= fractional_part_rounding_thresholds( - 8 - number_of_digits_to_print) - || ((fractional_part >> 31) - & ((digits & 1) - | (second_third_subsegments - != 0) - | has_more_segments)) - != 0; - } - // Rounding at the subsegment boundary. - // In this case, the fractional part is at least 1/2 if and only if - // second_third_subsegments >= 5000000000ULL, and is strictly larger - // than 1/2 if we further have either second_third_subsegments > - // 5000000000ULL or has_more_segments == true. - else { - should_round_up - = second_third_subsegments > 5000000000ULL - || (second_third_subsegments == 5000000000ULL - && ((digits & 1) != 0 - || has_more_segments)); - } - } - // Otherwise, print the second subsegment. - else { - // Compilers are not aware of how to leverage the maximum value of - // second_third_subsegments to find out a better magic number which - // allows us to eliminate an additional shift. 1844674407370955162 = - // ceil(2^64/10) < ceil(2^64*(10^9/(10^10 - 1))). - const uint32_t second_subsegment = static_cast( - dragonbox::umul128_upper64(second_third_subsegments, - 1844674407370955162ULL)); - const uint32_t third_subsegment - = static_cast(second_third_subsegments) - - second_subsegment * 10; - - number_of_digits_to_print = precision - 9; - print_subsegment(second_subsegment, buf.data() + 9); - - // Rounding inside the subsegment. - if (precision < 18) { - // The condition third_subsegment != 0 implies that the segment was - // of 19 digits, so in this case the third segment should be - // consisting of a genuine digit from the input. 
- uint32_t fractional_part = static_cast(prod); - should_round_up = fractional_part - >= fractional_part_rounding_thresholds( - 8 - number_of_digits_to_print) - || ((fractional_part >> 31) - & ((digits & 1) - | (third_subsegment != 0) - | has_more_segments)) - != 0; - } - // Rounding at the subsegment boundary. - else { - // In this case, the segment must be of 19 digits, thus - // the third subsegment should be consisting of a genuine digit from - // the input. - should_round_up = third_subsegment > 5 - || (third_subsegment == 5 - && ((digits & 1) != 0 - || has_more_segments)); - } - } - - // Round-up if necessary. - if (should_round_up) { - ++buf[precision - 1]; - for (int i = precision - 1; i > 0 && buf[i] > '9'; --i) { - buf[i] = '0'; - ++buf[i - 1]; - } - if (buf[0] > '9') { - buf[0] = '1'; - if (fixed) - buf[precision++] = '0'; - else - ++exp; - } - } - buf.try_resize(to_unsigned(precision)); - } - } // if (digits_in_the_first_segment > precision) - else { - // Adjust the exponent for its use in Dragon4. - exp += digits_in_the_first_segment - 1; - } - } - if (use_dragon) { - auto f = basic_fp(); - bool is_predecessor_closer = specs.binary32 - ? f.assign(static_cast(value)) - : f.assign(converted_value); - if (is_predecessor_closer) dragon_flags |= dragon::predecessor_closer; - if (fixed) dragon_flags |= dragon::fixed; - // Limit precision to the maximum possible number of significant digits in - // an IEEE754 double because we don't need to generate zeros. - const int max_double_digits = 767; - if (precision > max_double_digits) precision = max_double_digits; - format_dragon(f, dragon_flags, precision, buf, exp); - } - if (!fixed && !specs.showpoint) { - // Remove trailing zeros. - auto num_digits = buf.size(); - while (num_digits > 0 && buf[num_digits - 1] == '0') { - --num_digits; - ++exp; - } - buf.try_resize(num_digits); - } - return exp; -} -template -FMT_CONSTEXPR20 auto write_float(OutputIt out, T value, - format_specs specs, locale_ref loc) -> OutputIt { - float_specs fspecs = parse_float_type_spec(specs); - fspecs.sign = specs.sign; - if (detail::signbit(value)) { // value < 0 is false for NaN so use signbit. - fspecs.sign = sign::minus; - value = -value; - } else if (fspecs.sign == sign::minus) { - fspecs.sign = sign::none; - } - - if (!detail::isfinite(value)) - return write_nonfinite(out, detail::isnan(value), specs, fspecs); - - if (specs.align == align::numeric && fspecs.sign) { - auto it = reserve(out, 1); - *it++ = detail::sign(fspecs.sign); - out = base_iterator(out, it); - fspecs.sign = sign::none; - if (specs.width != 0) --specs.width; - } - - memory_buffer buffer; - if (fspecs.format == float_format::hex) { - if (fspecs.sign) buffer.push_back(detail::sign(fspecs.sign)); - format_hexfloat(convert_float(value), specs.precision, fspecs, buffer); - return write_bytes( - out, {buffer.data(), buffer.size()}, specs); - } - int precision - = specs.precision >= 0 || specs.type == presentation_type::none - ? 
specs.precision - : 6; - if (fspecs.format == float_format::exp) { - if (precision == max_value()) - throw_format_error("number is too big"); - else - ++precision; - } else if (fspecs.format != float_format::fixed && precision == 0) { - precision = 1; - } - if (const_check(std::is_same())) fspecs.binary32 = true; - int exp = format_float(convert_float(value), precision, fspecs, buffer); - fspecs.precision = precision; - auto f = big_decimal_fp { - buffer.data(), static_cast(buffer.size()), exp}; - return write_float(out, f, specs, fspecs, loc); -} - -template ::value)> -FMT_CONSTEXPR20 auto write(OutputIt out, T value, format_specs specs, - locale_ref loc = {}) -> OutputIt { - if (const_check(!is_supported_floating_point(value))) return out; - return specs.localized && write_loc(out, value, specs, loc) - ? out - : write_float(out, value, specs, loc); -} - -template ::value)> -FMT_CONSTEXPR20 auto write(OutputIt out, T value) -> OutputIt { - if (is_constant_evaluated()) return write(out, value, format_specs()); - if (const_check(!is_supported_floating_point(value))) return out; - - auto fspecs = float_specs(); - if (detail::signbit(value)) { - fspecs.sign = sign::minus; - value = -value; - } - - constexpr auto specs = format_specs(); - using floaty - = conditional_t::value, double, T>; - using floaty_uint = typename dragonbox::float_info::carrier_uint; - floaty_uint mask = exponent_mask(); - if ((bit_cast(value) & mask) == mask) - return write_nonfinite(out, std::isnan(value), specs, fspecs); - - auto dec = dragonbox::to_decimal(static_cast(value)); - return write_float(out, dec, specs, fspecs, {}); -} - -template ::value && !is_fast_float::value)> -inline auto write(OutputIt out, T value) -> OutputIt { - return write(out, value, format_specs()); -} - -template -auto write(OutputIt out, monostate, format_specs = {}, locale_ref = {}) - -> OutputIt { - FMT_ASSERT(false, ""); - return out; -} - -template -FMT_CONSTEXPR auto write(OutputIt out, basic_string_view value) - -> OutputIt { - auto it = reserve(out, value.size()); - it = copy_str_noinline(value.begin(), value.end(), it); - return base_iterator(out, it); -} - -template ::value)> -constexpr auto write(OutputIt out, const T &value) -> OutputIt { - return write(out, to_string_view(value)); -} - -// FMT_ENABLE_IF() condition separated to workaround an MSVC bug. -template ::value && !std::is_same::value - && mapped_type_constant>::value - != type::custom_type, - FMT_ENABLE_IF(check)> -FMT_CONSTEXPR auto write(OutputIt out, T value) -> OutputIt { - return write(out, static_cast>(value)); -} - -template ::value)> -FMT_CONSTEXPR auto write(OutputIt out, T value, - const format_specs &specs = {}, locale_ref = {}) -> OutputIt { - return specs.type != presentation_type::none - && specs.type != presentation_type::string - ? write(out, value ? 1 : 0, specs, {}) - : write_bytes(out, value ? "true" : "false", specs); -} - -template -FMT_CONSTEXPR auto write(OutputIt out, Char value) -> OutputIt { - auto it = reserve(out, 1); - *it++ = value; - return base_iterator(out, it); -} - -template -FMT_CONSTEXPR_CHAR_TRAITS auto write(OutputIt out, const Char *value) - -> OutputIt { - if (value) return write(out, basic_string_view(value)); - throw_format_error("string pointer is null"); - return out; -} - -template ::value)> -auto write(OutputIt out, const T *value, const format_specs &specs = {}, - locale_ref = {}) -> OutputIt { - return write_ptr(out, bit_cast(value), &specs); -} - -// A write overload that handles implicit conversions. 
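The deleted code above is fmt's float-formatting pipeline: a bare `{}` takes the Dragonbox shortest-round-trip branch, an explicit precision takes the fixed-precision branch (falling back to Dragon4 via format_dragon when the first digit segment runs out of digits), and the `a`/`A` presentation types route through format_hexfloat. A minimal sketch of how those branches are reached from the public API, assuming upstream fmtlib headers (this example is illustrative and not part of the diff):

#include <fmt/format.h>

int main() {
    auto shortest = fmt::format("{}", 0.1); // Dragonbox, shortest round-trip
    auto fixed = fmt::format("{:.10f}", 0.1); // fixed-precision branch
    auto hex = fmt::format("{:a}", 1.0); // format_hexfloat, yields "0x1p+0"
    fmt::print("{} {} {}\n", shortest, fixed, hex);
}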
-template > -FMT_CONSTEXPR auto write( - OutputIt out, const T &value) -> enable_if_t::value - && !is_string::value && !is_floating_point::value - && !std::is_same::value - && !std::is_same().map(value))>>::value, - OutputIt> { - return write(out, arg_mapper().map(value)); -} - -template > -FMT_CONSTEXPR auto write(OutputIt out, const T &value) - -> enable_if_t::value - == type::custom_type, - OutputIt> { - auto formatter = typename Context::template formatter_type(); - auto parse_ctx = typename Context::parse_context_type({}); - formatter.parse(parse_ctx); - auto ctx = Context(out, {}, {}); - return formatter.format(value, ctx); -} - -// An argument visitor that formats the argument and writes it via the output -// iterator. It's a class and not a generic lambda for compatibility with C++11. -template -struct default_arg_formatter { - using iterator = buffer_appender; - using context = buffer_context; - - iterator out; - basic_format_args args; - locale_ref loc; - - template - auto operator()(T value) -> iterator { - return write(out, value); - } - auto operator()(typename basic_format_arg::handle h) -> iterator { - basic_format_parse_context parse_ctx({}); - context format_ctx(out, args, loc); - h.format(parse_ctx, format_ctx); - return format_ctx.out(); - } -}; - -template -struct arg_formatter { - using iterator = buffer_appender; - using context = buffer_context; - - iterator out; - const format_specs &specs; - locale_ref locale; - - template - FMT_CONSTEXPR FMT_INLINE auto operator()(T value) -> iterator { - return detail::write(out, value, specs, locale); - } - auto operator()(typename basic_format_arg::handle) -> iterator { - // User-defined types are handled separately because they require access - // to the parse context. - return out; - } -}; - -struct width_checker { - template ::value)> - FMT_CONSTEXPR auto operator()(T value) -> unsigned long long { - if (is_negative(value)) throw_format_error("negative width"); - return static_cast(value); - } - - template ::value)> - FMT_CONSTEXPR auto operator()(T) -> unsigned long long { - throw_format_error("width is not integer"); - return 0; - } -}; - -struct precision_checker { - template ::value)> - FMT_CONSTEXPR auto operator()(T value) -> unsigned long long { - if (is_negative(value)) throw_format_error("negative precision"); - return static_cast(value); - } - - template ::value)> - FMT_CONSTEXPR auto operator()(T) -> unsigned long long { - throw_format_error("precision is not integer"); - return 0; - } -}; - -template -FMT_CONSTEXPR auto get_dynamic_spec(FormatArg arg) -> int { - unsigned long long value = visit_format_arg(Handler(), arg); - if (value > to_unsigned(max_value())) - throw_format_error("number is too big"); - return static_cast(value); -} - -template -FMT_CONSTEXPR auto get_arg(Context &ctx, ID id) -> decltype(ctx.arg(id)) { - auto arg = ctx.arg(id); - if (!arg) ctx.on_error("argument not found"); - return arg; -} - -template -FMT_CONSTEXPR void handle_dynamic_spec( - int &value, arg_ref ref, Context &ctx) { - switch (ref.kind) { - case arg_id_kind::none: break; - case arg_id_kind::index: - value = detail::get_dynamic_spec( - get_arg(ctx, ref.val.index)); - break; - case arg_id_kind::name: - value = detail::get_dynamic_spec( - get_arg(ctx, ref.val.name)); - break; - } -} - -#if FMT_USE_USER_DEFINED_LITERALS -#if FMT_USE_NONTYPE_TEMPLATE_ARGS -template Str> -struct statically_named_arg : view { - static constexpr auto name = Str.data; - - const T &value; - statically_named_arg(const T &v) : value(v) {} -}; - -template 
<typename T, typename Char, size_t N,
-        fmt::detail_exported::fixed_string<Char, N> Str>
-struct is_named_arg<statically_named_arg<T, Char, N, Str>> : std::true_type {};
-
-template <typename T, typename Char, size_t N,
-        fmt::detail_exported::fixed_string<Char, N> Str>
-struct is_statically_named_arg<statically_named_arg<T, Char, N, Str>>
-    : std::true_type {};
-
-template <typename Char, size_t N,
-        fmt::detail_exported::fixed_string<Char, N> Str>
-struct udl_arg {
-    template <typename T>
-    auto operator=(T &&value) const {
-        return statically_named_arg<T, Char, N, Str>(std::forward<T>(value));
-    }
-};
-#else
-template <typename Char>
-struct udl_arg {
-    const Char *str;
-
-    template <typename T>
-    auto operator=(T &&value) const -> named_arg<Char, T> {
-        return {str, std::forward<T>(value)};
-    }
-};
-#endif
-#endif // FMT_USE_USER_DEFINED_LITERALS
-
-template <typename Locale, typename Char>
-auto vformat(const Locale &loc, basic_string_view<Char> fmt,
-        basic_format_args<buffer_context<type_identity_t<Char>>> args)
-        -> std::basic_string<Char> {
-    auto buf = basic_memory_buffer<Char>();
-    detail::vformat_to(buf, fmt, args, detail::locale_ref(loc));
-    return {buf.data(), buf.size()};
-}
-
-using format_func = void (*)(detail::buffer<char> &, int, const char *);
-
-FMT_API void format_error_code(
-        buffer<char> &out, int error_code, string_view message) noexcept;
-
-FMT_API void report_error(
-        format_func func, int error_code, const char *message) noexcept;
-} // namespace detail
-
-FMT_API auto vsystem_error(int error_code, string_view format_str,
-        format_args args) -> std::system_error;
-
-/**
- \rst
- Constructs :class:`std::system_error` with a message formatted with
- ``fmt::format(fmt, args...)``.
- *error_code* is a system error code as given by ``errno``.
-
- **Example**::
-
-   // This throws std::system_error with the description
-   //   cannot open file 'madeup': No such file or directory
-   // or similar (system message may vary).
-   const char* filename = "madeup";
-   std::FILE* file = std::fopen(filename, "r");
-   if (!file)
-     throw fmt::system_error(errno, "cannot open file '{}'", filename);
- \endrst
- */
-template <typename... T>
-auto system_error(int error_code, format_string<T...> fmt, T &&...args)
-        -> std::system_error {
-    return vsystem_error(error_code, fmt, fmt::make_format_args(args...));
-}
-
-/**
- \rst
- Formats an error message for an error returned by an operating system or a
- language runtime, for example a file opening error, and writes it to *out*.
- The format is the same as the one used by ``std::system_error(ec, message)``
- where ``ec`` is ``std::error_code(error_code, std::generic_category())``.
- It is implementation-defined but normally looks like:
-
- .. parsed-literal::
-    *<message>*: *<system-message>*
-
- where *<message>* is the passed message and *<system-message>* is the system
- message corresponding to the error code.
- *error_code* is a system error code as given by ``errno``.
- \endrst
- */
-FMT_API void format_system_error(detail::buffer<char> &out, int error_code,
-        const char *message) noexcept;
-
-// Reports a system error without throwing an exception.
-// Can be used to report errors from destructors.
-FMT_API void report_system_error(int error_code, const char *message) noexcept;
-
-/** Fast integer formatter. */
-class format_int {
-private:
-    // Buffer should be large enough to hold all digits (digits10 + 1),
-    // a sign and a null character.
-    enum {
-        buffer_size = std::numeric_limits<unsigned long long>::digits10 + 3
-    };
-    mutable char buffer_[buffer_size];
-    char *str_;
-
-    template <typename UInt>
-    auto format_unsigned(UInt value) -> char * {
-        auto n = static_cast<detail::uint32_or_64_or_128_t<UInt>>(value);
-        return detail::format_decimal(buffer_, n, buffer_size - 1).begin;
-    }
-
-    template <typename Int>
-    auto format_signed(Int value) -> char * {
-        auto abs_value = static_cast<detail::uint32_or_64_or_128_t<Int>>(value);
-        bool negative = value < 0;
-        if (negative) abs_value = 0 - abs_value;
-        auto begin = format_unsigned(abs_value);
-        if (negative) *--begin = '-';
-        return begin;
-    }
-
-public:
-    explicit format_int(int value) : str_(format_signed(value)) {}
-    explicit format_int(long value) : str_(format_signed(value)) {}
-    explicit format_int(long long value) : str_(format_signed(value)) {}
-    explicit format_int(unsigned value) : str_(format_unsigned(value)) {}
-    explicit format_int(unsigned long value) : str_(format_unsigned(value)) {}
-    explicit format_int(unsigned long long value)
-        : str_(format_unsigned(value)) {}
-
-    /** Returns the number of characters written to the output buffer. */
-    auto size() const -> size_t {
-        return detail::to_unsigned(buffer_ - str_ + buffer_size - 1);
-    }
-
-    /**
-      Returns a pointer to the output buffer content. No terminating null
-      character is appended.
-     */
-    auto data() const -> const char * { return str_; }
-
-    /**
-      Returns a pointer to the output buffer content with terminating null
-      character appended.
-     */
-    auto c_str() const -> const char * {
-        buffer_[buffer_size - 1] = '\0';
-        return str_;
-    }
-
-    /**
-      \rst
-      Returns the content of the output buffer as an ``std::string``.
-      \endrst
-     */
-    auto str() const -> std::string { return std::string(str_, size()); }
-};
-
-template <typename T, typename Char>
-struct formatter<T, Char, enable_if_t<detail::has_format_as<T>::value>>
-    : formatter<detail::format_as_t<T>, Char> {
-    template <typename FormatContext>
-    auto format(const T &value, FormatContext &ctx) const
-            -> decltype(ctx.out()) {
-        using base = formatter<detail::format_as_t<T>, Char>;
-        return base::format(format_as(value), ctx);
-    }
-};
-
-#define FMT_FORMAT_AS(Type, Base) \
-    template <typename Char> \
-    struct formatter<Type, Char> : formatter<Base, Char> {}
-
-FMT_FORMAT_AS(signed char, int);
-FMT_FORMAT_AS(unsigned char, unsigned);
-FMT_FORMAT_AS(short, int);
-FMT_FORMAT_AS(unsigned short, unsigned);
-FMT_FORMAT_AS(long, detail::long_type);
-FMT_FORMAT_AS(unsigned long, detail::ulong_type);
-FMT_FORMAT_AS(Char *, const Char *);
-FMT_FORMAT_AS(std::basic_string<Char>, basic_string_view<Char>);
-FMT_FORMAT_AS(std::nullptr_t, const void *);
-FMT_FORMAT_AS(detail::std_string_view<Char>, basic_string_view<Char>);
-FMT_FORMAT_AS(void *, const void *);
-
-template <typename Char, size_t N>
-struct formatter<Char[N], Char> : formatter<basic_string_view<Char>, Char> {};
-
-/**
-  \rst
-  Converts ``p`` to ``const void*`` for pointer formatting.
-
-  **Example**::
-
-    auto s = fmt::format("{}", fmt::ptr(p));
-  \endrst
- */
-template <typename T>
-auto ptr(T p) -> const void * {
-    static_assert(std::is_pointer<T>::value, "");
-    return detail::bit_cast<const void *>(p);
-}
-template <typename T, typename Deleter>
-auto ptr(const std::unique_ptr<T, Deleter> &p) -> const void * {
-    return p.get();
-}
-template <typename T>
-auto ptr(const std::shared_ptr<T> &p) -> const void * {
-    return p.get();
-}
-
-/**
-  \rst
-  Converts ``e`` to the underlying type.
- - **Example**:: - - enum class color { red, green, blue }; - auto s = fmt::format("{}", fmt::underlying(color::red)); - \endrst - */ -template -constexpr auto underlying(Enum e) noexcept -> underlying_t { - return static_cast>(e); -} - -namespace enums { -template ::value)> -constexpr auto format_as(Enum e) noexcept -> underlying_t { - return static_cast>(e); -} -} // namespace enums - -class bytes { -private: - string_view data_; - friend struct formatter; - -public: - explicit bytes(string_view data) : data_(data) {} -}; - -template <> -struct formatter { -private: - detail::dynamic_format_specs<> specs_; - -public: - template - FMT_CONSTEXPR auto parse(ParseContext &ctx) -> const char * { - return parse_format_specs( - ctx.begin(), ctx.end(), specs_, ctx, detail::type::string_type); - } - - template - auto format(bytes b, FormatContext &ctx) -> decltype(ctx.out()) { - detail::handle_dynamic_spec( - specs_.width, specs_.width_ref, ctx); - detail::handle_dynamic_spec( - specs_.precision, specs_.precision_ref, ctx); - return detail::write_bytes(ctx.out(), b.data_, specs_); - } -}; - -// group_digits_view is not derived from view because it copies the argument. -template -struct group_digits_view { - T value; -}; - -/** - \rst - Returns a view that formats an integer value using ',' as a locale-independent - thousands separator. - - **Example**:: - - fmt::print("{}", fmt::group_digits(12345)); - // Output: "12,345" - \endrst - */ -template -auto group_digits(T value) -> group_digits_view { - return {value}; -} - -template -struct formatter> : formatter { -private: - detail::dynamic_format_specs<> specs_; - -public: - template - FMT_CONSTEXPR auto parse(ParseContext &ctx) -> const char * { - return parse_format_specs( - ctx.begin(), ctx.end(), specs_, ctx, detail::type::int_type); - } - - template - auto format(group_digits_view t, FormatContext &ctx) - -> decltype(ctx.out()) { - detail::handle_dynamic_spec( - specs_.width, specs_.width_ref, ctx); - detail::handle_dynamic_spec( - specs_.precision, specs_.precision_ref, ctx); - return detail::write_int(ctx.out(), - static_cast>(t.value), 0, specs_, - detail::digit_grouping("\3", ",")); - } -}; - -template -struct nested_view { - const formatter *fmt; - const T *value; -}; - -template -struct formatter> { - FMT_CONSTEXPR auto parse(format_parse_context &ctx) -> const char * { - return ctx.begin(); - } - auto format(nested_view view, format_context &ctx) const - -> decltype(ctx.out()) { - return view.fmt->format(*view.value, ctx); - } -}; - -template -struct nested_formatter { -private: - int width_; - detail::fill_t fill_; - align_t align_ : 4; - formatter formatter_; - -public: - constexpr nested_formatter() : width_(0), align_(align_t::none) {} - - FMT_CONSTEXPR auto parse(format_parse_context &ctx) -> const char * { - auto specs = detail::dynamic_format_specs(); - auto it = parse_format_specs( - ctx.begin(), ctx.end(), specs, ctx, detail::type::none_type); - width_ = specs.width; - fill_ = specs.fill; - align_ = specs.align; - ctx.advance_to(it); - return formatter_.parse(ctx); - } - - template - auto write_padded(format_context &ctx, F write) const - -> decltype(ctx.out()) { - if (width_ == 0) return write(ctx.out()); - auto buf = memory_buffer(); - write(std::back_inserter(buf)); - auto specs = format_specs<>(); - specs.width = width_; - specs.fill = fill_; - specs.align = align_; - return detail::write( - ctx.out(), string_view(buf.data(), buf.size()), specs); - } - - auto nested(const T &value) const -> nested_view { - return 
nested_view<T> {&formatter_, &value};
-    }
-};
-
-// DEPRECATED! join_view will be moved to ranges.h.
-template <typename It, typename Sentinel, typename Char = char>
-struct join_view : detail::view {
-    It begin;
-    Sentinel end;
-    basic_string_view<Char> sep;
-
-    join_view(It b, Sentinel e, basic_string_view<Char> s)
-        : begin(b), end(e), sep(s) {}
-};
-
-template <typename It, typename Sentinel, typename Char>
-struct formatter<join_view<It, Sentinel, Char>, Char> {
-private:
-    using value_type =
-#ifdef __cpp_lib_ranges
-            std::iter_value_t<It>;
-#else
-            typename std::iterator_traits<It>::value_type;
-#endif
-    formatter<remove_cvref_t<value_type>, Char> value_formatter_;
-
-public:
-    template <typename ParseContext>
-    FMT_CONSTEXPR auto parse(ParseContext &ctx) -> const Char * {
-        return value_formatter_.parse(ctx);
-    }
-
-    template <typename FormatContext>
-    auto format(const join_view<It, Sentinel, Char> &value,
-            FormatContext &ctx) const -> decltype(ctx.out()) {
-        auto it = value.begin;
-        auto out = ctx.out();
-        if (it != value.end) {
-            out = value_formatter_.format(*it, ctx);
-            ++it;
-            while (it != value.end) {
-                out = detail::copy_str<Char>(
-                        value.sep.begin(), value.sep.end(), out);
-                ctx.advance_to(out);
-                out = value_formatter_.format(*it, ctx);
-                ++it;
-            }
-        }
-        return out;
-    }
-};
-
-/**
-  Returns a view that formats the iterator range `[begin, end)` with elements
-  separated by `sep`.
- */
-template <typename It, typename Sentinel>
-auto join(It begin, Sentinel end, string_view sep) -> join_view<It, Sentinel> {
-    return {begin, end, sep};
-}
-
-/**
-  \rst
-  Returns a view that formats `range` with elements separated by `sep`.
-
-  **Example**::
-
-    std::vector<int> v = {1, 2, 3};
-    fmt::print("{}", fmt::join(v, ", "));
-    // Output: "1, 2, 3"
-
-  ``fmt::join`` applies passed format specifiers to the range elements::
-
-    fmt::print("{:02}", fmt::join(v, ", "));
-    // Output: "01, 02, 03"
-  \endrst
- */
-template <typename Range>
-auto join(Range &&range, string_view sep)
-        -> join_view<detail::iterator_t<Range>, detail::sentinel_t<Range>> {
-    return join(std::begin(range), std::end(range), sep);
-}
-
-/**
-  \rst
-  Converts *value* to ``std::string`` using the default format for type *T*.
-
-  **Example**::
-
-    #include <fmt/format.h>
-
-    std::string answer = fmt::to_string(42);
-  \endrst
- */
-template <typename T,
-        FMT_ENABLE_IF(!std::is_integral<T>::value
-                && !detail::has_format_as<T>::value)>
-inline auto to_string(const T &value) -> std::string {
-    auto buffer = memory_buffer();
-    detail::write<char>(appender(buffer), value);
-    return {buffer.data(), buffer.size()};
-}
-
-template <typename T, FMT_ENABLE_IF(std::is_integral<T>::value)>
-FMT_NODISCARD inline auto to_string(T value) -> std::string {
-    // The buffer should be large enough to store the number including the sign
-    // or "false" for bool.
-    constexpr int max_size = detail::digits10<T>() + 2;
-    char buffer[max_size > 5 ?
static_cast(max_size) : 5]; - char *begin = buffer; - return std::string(begin, detail::write(begin, value)); -} - -template -FMT_NODISCARD auto to_string(const basic_memory_buffer &buf) - -> std::basic_string { - auto size = buf.size(); - detail::assume(size < std::basic_string().max_size()); - return std::basic_string(buf.data(), size); -} - -template ::value && detail::has_format_as::value)> -inline auto to_string(const T &value) -> std::string { - return to_string(format_as(value)); -} - -FMT_END_EXPORT - -namespace detail { - -template -void vformat_to(buffer &buf, basic_string_view fmt, - typename vformat_args::type args, locale_ref loc) { - auto out = buffer_appender(buf); - if (fmt.size() == 2 && equal2(fmt.data(), "{}")) { - auto arg = args.get(0); - if (!arg) throw_format_error("argument not found"); - visit_format_arg(default_arg_formatter {out, args, loc}, arg); - return; - } - - struct format_handler : error_handler { - basic_format_parse_context parse_context; - buffer_context context; - - format_handler(buffer_appender p_out, basic_string_view str, - basic_format_args> p_args, - locale_ref p_loc) - : parse_context(str), context(p_out, p_args, p_loc) {} - - void on_text(const Char *begin, const Char *end) { - auto text - = basic_string_view(begin, to_unsigned(end - begin)); - context.advance_to(write(context.out(), text)); - } - - FMT_CONSTEXPR auto on_arg_id() -> int { - return parse_context.next_arg_id(); - } - FMT_CONSTEXPR auto on_arg_id(int id) -> int { - return parse_context.check_arg_id(id), id; - } - FMT_CONSTEXPR auto on_arg_id(basic_string_view id) -> int { - int arg_id = context.arg_id(id); - if (arg_id < 0) throw_format_error("argument not found"); - return arg_id; - } - - FMT_INLINE void on_replacement_field(int id, const Char *) { - auto arg = get_arg(context, id); - context.advance_to( - visit_format_arg(default_arg_formatter {context.out(), - context.args(), context.locale()}, - arg)); - } - - auto on_format_specs(int id, const Char *begin, const Char *end) - -> const Char * { - auto arg = get_arg(context, id); - // Not using a visitor for custom types gives better codegen. - if (arg.format_custom(begin, parse_context, context)) - return parse_context.begin(); - auto specs = detail::dynamic_format_specs(); - begin = parse_format_specs( - begin, end, specs, parse_context, arg.type()); - detail::handle_dynamic_spec( - specs.width, specs.width_ref, context); - detail::handle_dynamic_spec( - specs.precision, specs.precision_ref, context); - if (begin == end || *begin != '}') - throw_format_error("missing '}' in format string"); - auto f = arg_formatter { - context.out(), specs, context.locale()}; - context.advance_to(visit_format_arg(f, arg)); - return begin; - } - }; - detail::parse_format_string( - fmt, format_handler(out, fmt, args, loc)); -} - -FMT_BEGIN_EXPORT - -#ifndef FMT_HEADER_ONLY -extern template FMT_API void vformat_to( - buffer &, string_view, typename vformat_args<>::type, locale_ref); -extern template FMT_API auto thousands_sep_impl(locale_ref) - -> thousands_sep_result; -extern template FMT_API auto thousands_sep_impl(locale_ref) - -> thousands_sep_result; -extern template FMT_API auto decimal_point_impl(locale_ref) -> char; -extern template FMT_API auto decimal_point_impl(locale_ref) -> wchar_t; -#endif // FMT_HEADER_ONLY - -} // namespace detail - -#if FMT_USE_USER_DEFINED_LITERALS -inline namespace literals { -/** - \rst - User-defined literal equivalent of :func:`fmt::arg`. 
-
- **Example**::
-
-   using namespace fmt::literals;
-   fmt::print("Elapsed time: {s:.2f} seconds", "s"_a=1.23);
- \endrst
- */
-#if FMT_USE_NONTYPE_TEMPLATE_ARGS
-template <detail_exported::fixed_string Str>
-constexpr auto operator""_a() {
-    using char_t = remove_cvref_t<decltype(Str.data[0])>;
-    return detail::udl_arg<char_t, sizeof(Str.data) / sizeof(char_t), Str>();
-}
-#else
-constexpr auto operator""_a(const char *s, size_t) -> detail::udl_arg<char> {
-    return {s};
-}
-#endif
-} // namespace literals
-#endif // FMT_USE_USER_DEFINED_LITERALS
-
-template <typename Locale, FMT_ENABLE_IF(detail::is_locale<Locale>::value)>
-inline auto vformat(const Locale &loc, string_view fmt, format_args args)
-        -> std::string {
-    return detail::vformat(loc, fmt, args);
-}
-
-template <typename Locale, typename... T,
-        FMT_ENABLE_IF(detail::is_locale<Locale>::value)>
-inline auto format(const Locale &loc, format_string<T...> fmt, T &&...args)
-        -> std::string {
-    return fmt::vformat(loc, string_view(fmt), fmt::make_format_args(args...));
-}
-
-template <typename OutputIt, typename Locale,
-        FMT_ENABLE_IF(detail::is_output_iterator<OutputIt, char>::value
-                &&detail::is_locale<Locale>::value)>
-auto vformat_to(OutputIt out, const Locale &loc, string_view fmt,
-        format_args args) -> OutputIt {
-    using detail::get_buffer;
-    auto &&buf = get_buffer<char>(out);
-    detail::vformat_to(buf, fmt, args, detail::locale_ref(loc));
-    return detail::get_iterator(buf, out);
-}
-
-template <typename OutputIt, typename Locale, typename... T,
-        FMT_ENABLE_IF(detail::is_output_iterator<OutputIt, char>::value
-                &&detail::is_locale<Locale>::value)>
-FMT_INLINE auto format_to(OutputIt out, const Locale &loc,
-        format_string<T...> fmt, T &&...args) -> OutputIt {
-    return vformat_to(out, loc, fmt, fmt::make_format_args(args...));
-}
-
-template <typename Locale, typename... T,
-        FMT_ENABLE_IF(detail::is_locale<Locale>::value)>
-FMT_NODISCARD FMT_INLINE auto formatted_size(
-        const Locale &loc, format_string<T...> fmt, T &&...args) -> size_t {
-    auto buf = detail::counting_buffer<>();
-    detail::vformat_to<char>(
-            buf, fmt, fmt::make_format_args(args...), detail::locale_ref(loc));
-    return buf.count();
-}
-
-FMT_END_EXPORT
-
-template <typename T, typename Char>
-template <typename FormatContext>
-FMT_CONSTEXPR FMT_INLINE auto formatter<T, Char,
-        enable_if_t<detail::type_constant<T, Char>::value
-                != detail::type::custom_type>>::format(const T &val,
-        FormatContext &ctx) const -> decltype(ctx.out()) {
-    if (specs_.width_ref.kind == detail::arg_id_kind::none
-            && specs_.precision_ref.kind == detail::arg_id_kind::none) {
-        return detail::write<Char>(ctx.out(), val, specs_, ctx.locale());
-    }
-    auto specs = specs_;
-    detail::handle_dynamic_spec<detail::width_checker>(
-            specs.width, specs.width_ref, ctx);
-    detail::handle_dynamic_spec<detail::precision_checker>(
-            specs.precision, specs.precision_ref, ctx);
-    return detail::write<Char>(ctx.out(), val, specs, ctx.locale());
-}
-
-FMT_END_NAMESPACE
-
-#ifdef FMT_HEADER_ONLY
-#define FMT_FUNC inline
-#include "format-inl.h"
-#else
-#define FMT_FUNC
-#endif
-
-#endif // FMT_FORMAT_H_
diff --git a/src/common/spdlog/fmt/fmt.h b/src/common/spdlog/fmt/fmt.h
deleted file mode 100755
index 426251ea4e1..00000000000
--- a/src/common/spdlog/fmt/fmt.h
+++ /dev/null
@@ -1,31 +0,0 @@
-//
-// Copyright(c) 2016-2018 Gabi Melman.
-// Distributed under the MIT License (http://opensource.org/licenses/MIT)
-//
-
-#pragma once
-
-//
-// Include a bundled header-only copy of fmtlib or an external one.
-// By default spdlog includes its own copy.
-//
-#include <spdlog/tweakme.h>
-
-#if defined( \
-        SPDLOG_USE_STD_FORMAT) // SPDLOG_USE_STD_FORMAT is defined - use std::format
-#include <format>
-#elif !defined(SPDLOG_FMT_EXTERNAL)
-#if !defined(SPDLOG_COMPILED_LIB) && !defined(FMT_HEADER_ONLY)
-#define FMT_HEADER_ONLY
-#endif
-#ifndef FMT_USE_WINDOWS_H
-#define FMT_USE_WINDOWS_H 0
-#endif
-
-#include <spdlog/fmt/bundled/core.h>
-#include <spdlog/fmt/bundled/format.h>
-
-#else // SPDLOG_FMT_EXTERNAL is defined - use external fmtlib
-#include <fmt/core.h>
-#include <fmt/format.h>
-#endif
diff --git a/src/common/spdlog/logger-inl.h b/src/common/spdlog/logger-inl.h
deleted file mode 100755
index 08e52ad0e27..00000000000
--- a/src/common/spdlog/logger-inl.h
+++ /dev/null
@@ -1,215 +0,0 @@
-// Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
-// Distributed under the MIT License (http://opensource.org/licenses/MIT)
-
-#pragma once
-
-#ifndef SPDLOG_HEADER_ONLY
-#include <spdlog/logger.h>
-#endif
-
-#include <spdlog/sinks/sink.h>
-#include <spdlog/details/backtracer.h>
-#include <spdlog/pattern_formatter.h>
-
-#include <cstdio>
-
-namespace spdlog {
-
-// public methods
-SPDLOG_INLINE logger::logger(const logger &other)
-    : name_(other.name_)
-    , sinks_(other.sinks_)
-    , level_(other.level_.load(std::memory_order_relaxed))
-    , flush_level_(other.flush_level_.load(std::memory_order_relaxed))
-    , custom_err_handler_(other.custom_err_handler_)
-    , tracer_(other.tracer_) {}
-
-SPDLOG_INLINE logger::logger(logger &&other) SPDLOG_NOEXCEPT
-    : name_(std::move(other.name_)),
-      sinks_(std::move(other.sinks_)),
-      level_(other.level_.load(std::memory_order_relaxed)),
-      flush_level_(other.flush_level_.load(std::memory_order_relaxed)),
-      custom_err_handler_(std::move(other.custom_err_handler_)),
-      tracer_(std::move(other.tracer_))
-
-{}
-
-SPDLOG_INLINE logger &logger::operator=(logger other) SPDLOG_NOEXCEPT {
-    this->swap(other);
-    return *this;
-}
-
-SPDLOG_INLINE void logger::swap(spdlog::logger &other) SPDLOG_NOEXCEPT {
-    name_.swap(other.name_);
-    sinks_.swap(other.sinks_);
-
-    // swap level_
-    auto other_level = other.level_.load();
-    auto my_level = level_.exchange(other_level);
-    other.level_.store(my_level);
-
-    // swap flush level_
-    other_level = other.flush_level_.load();
-    my_level = flush_level_.exchange(other_level);
-    other.flush_level_.store(my_level);
-
-    custom_err_handler_.swap(other.custom_err_handler_);
-    std::swap(tracer_, other.tracer_);
-}
-
-SPDLOG_INLINE void swap(logger &a, logger &b) {
-    a.swap(b);
-}
-
-SPDLOG_INLINE void logger::set_level(level::level_enum log_level) {
-    level_.store(log_level);
-}
-
-SPDLOG_INLINE level::level_enum logger::level() const {
-    return static_cast<level::level_enum>(
-            level_.load(std::memory_order_relaxed));
-}
-
-SPDLOG_INLINE const std::string &logger::name() const {
-    return name_;
-}
-
-// set formatting for the sinks in this logger.
-// each sink will get a separate instance of the formatter object.
-SPDLOG_INLINE void logger::set_formatter(std::unique_ptr<formatter> f) {
-    for (auto it = sinks_.begin(); it != sinks_.end(); ++it) {
-        if (std::next(it) == sinks_.end()) {
-            // last element - we can move it.
- (*it)->set_formatter(std::move(f)); - break; // to prevent clang-tidy warning - } else { - (*it)->set_formatter(f->clone()); - } - } -} - -SPDLOG_INLINE void logger::set_pattern( - std::string pattern, pattern_time_type time_type) { - auto new_formatter = details::make_unique( - std::move(pattern), time_type); - set_formatter(std::move(new_formatter)); -} - -// create new backtrace sink and move to it all our child sinks -SPDLOG_INLINE void logger::enable_backtrace(size_t n_messages) { - tracer_.enable(n_messages); -} - -// restore orig sinks and level and delete the backtrace sink -SPDLOG_INLINE void logger::disable_backtrace() { - tracer_.disable(); -} - -SPDLOG_INLINE void logger::dump_backtrace() { - dump_backtrace_(); -} - -// flush functions -SPDLOG_INLINE void logger::flush() { - flush_(); -} - -SPDLOG_INLINE void logger::flush_on(level::level_enum log_level) { - flush_level_.store(log_level); -} - -SPDLOG_INLINE level::level_enum logger::flush_level() const { - return static_cast( - flush_level_.load(std::memory_order_relaxed)); -} - -// sinks -SPDLOG_INLINE const std::vector &logger::sinks() const { - return sinks_; -} - -SPDLOG_INLINE std::vector &logger::sinks() { - return sinks_; -} - -// error handler -SPDLOG_INLINE void logger::set_error_handler(err_handler handler) { - custom_err_handler_ = std::move(handler); -} - -// create new logger with same sinks and configuration. -SPDLOG_INLINE std::shared_ptr logger::clone(std::string logger_name) { - auto cloned = std::make_shared(*this); - cloned->name_ = std::move(logger_name); - return cloned; -} - -// protected methods -SPDLOG_INLINE void logger::log_it_(const spdlog::details::log_msg &log_msg, - bool log_enabled, bool traceback_enabled) { - if (log_enabled) { sink_it_(log_msg); } - if (traceback_enabled) { tracer_.push_back(log_msg); } -} - -SPDLOG_INLINE void logger::sink_it_(const details::log_msg &msg) { - for (auto &sink : sinks_) { - if (sink->should_log(msg.level)) { - SPDLOG_TRY { sink->log(msg); } - SPDLOG_LOGGER_CATCH(msg.source) - } - } - - if (should_flush_(msg)) { flush_(); } -} - -SPDLOG_INLINE void logger::flush_() { - for (auto &sink : sinks_) { - SPDLOG_TRY { sink->flush(); } - SPDLOG_LOGGER_CATCH(source_loc()) - } -} - -SPDLOG_INLINE void logger::dump_backtrace_() { - using details::log_msg; - if (tracer_.enabled() && !tracer_.empty()) { - sink_it_(log_msg {name(), level::info, - "****************** Backtrace Start ******************"}); - tracer_.foreach_pop( - [this](const log_msg &msg) { this->sink_it_(msg); }); - sink_it_(log_msg {name(), level::info, - "****************** Backtrace End ********************"}); - } -} - -SPDLOG_INLINE bool logger::should_flush_(const details::log_msg &msg) { - auto flush_level = flush_level_.load(std::memory_order_relaxed); - return (msg.level >= flush_level) && (msg.level != level::off); -} - -SPDLOG_INLINE void logger::err_handler_(const std::string &msg) { - if (custom_err_handler_) { - custom_err_handler_(msg); - } else { - using std::chrono::system_clock; - static std::mutex mutex; - static std::chrono::system_clock::time_point last_report_time; - static size_t err_counter = 0; - std::lock_guard lk {mutex}; - auto now = system_clock::now(); - err_counter++; - if (now - last_report_time < std::chrono::seconds(1)) { return; } - last_report_time = now; - auto tm_time = details::os::localtime(system_clock::to_time_t(now)); - char date_buf[64]; - std::strftime( - date_buf, sizeof(date_buf), "%Y-%m-%d %H:%M:%S", &tm_time); -#if defined(USING_R) && defined(R_R_H) // if in 
R environment - REprintf("[*** LOG ERROR #%04zu ***] [%s] [%s] %s\n", err_counter, - date_buf, name().c_str(), msg.c_str()); -#else - std::fprintf(stderr, "[*** LOG ERROR #%04zu ***] [%s] [%s] %s\n", - err_counter, date_buf, name().c_str(), msg.c_str()); -#endif - } -} -} // namespace spdlog diff --git a/src/common/spdlog/logger.h b/src/common/spdlog/logger.h deleted file mode 100755 index 4de596385fe..00000000000 --- a/src/common/spdlog/logger.h +++ /dev/null @@ -1,386 +0,0 @@ -// Copyright(c) 2015-present, Gabi Melman & spdlog contributors. -// Distributed under the MIT License (http://opensource.org/licenses/MIT) - -#pragma once - -// Thread safe logger (except for set_error_handler()) -// Has name, log level, vector of std::shared sink pointers and formatter -// Upon each log write the logger: -// 1. Checks if its log level is enough to log the message and if yes: -// 2. Call the underlying sinks to do the job. -// 3. Each sink use its own private copy of a formatter to format the message -// and send to its destination. -// -// The use of private formatter per sink provides the opportunity to cache some -// formatted data, and support for different format per sink. - -#include -#include -#include - -#ifdef SPDLOG_WCHAR_TO_UTF8_SUPPORT -#ifndef _WIN32 -#error SPDLOG_WCHAR_TO_UTF8_SUPPORT only supported on windows -#endif -#include -#endif - -#include - -#ifndef SPDLOG_NO_EXCEPTIONS -#define SPDLOG_LOGGER_CATCH(location) \ - catch (const std::exception &ex) { \ - if (location.filename) { \ - err_handler_(fmt_lib::format(SPDLOG_FMT_STRING("{} [{}({})]"), \ - ex.what(), location.filename, location.line)); \ - } else { \ - err_handler_(ex.what()); \ - } \ - } \ - catch (...) { \ - err_handler_("Rethrowing unknown exception in logger"); \ - throw; \ - } -#else -#define SPDLOG_LOGGER_CATCH(location) -#endif - -namespace spdlog { - -class SPDLOG_API logger { -public: - // Empty logger - explicit logger(std::string name) : name_(std::move(name)), sinks_() {} - - // Logger with range on sinks - template - logger(std::string name, It begin, It end) - : name_(std::move(name)), sinks_(begin, end) {} - - // Logger with single sink - logger(std::string name, sink_ptr single_sink) - : logger(std::move(name), {std::move(single_sink)}) {} - - // Logger with sinks init list - logger(std::string name, sinks_init_list sinks) - : logger(std::move(name), sinks.begin(), sinks.end()) {} - - virtual ~logger() = default; - - logger(const logger &other); - logger(logger &&other) SPDLOG_NOEXCEPT; - logger &operator=(logger other) SPDLOG_NOEXCEPT; - void swap(spdlog::logger &other) SPDLOG_NOEXCEPT; - - template - void log(source_loc loc, level::level_enum lvl, - format_string_t fmt, Args &&...args) { - log_(loc, lvl, details::to_string_view(fmt), - std::forward(args)...); - } - - template - void log(level::level_enum lvl, format_string_t fmt, - Args &&...args) { - log(source_loc {}, lvl, fmt, std::forward(args)...); - } - - template - void log(level::level_enum lvl, const T &msg) { - log(source_loc {}, lvl, msg); - } - - // T cannot be statically converted to format string (including string_view/wstring_view) - template ::value, - int>::type - = 0> - void log(source_loc loc, level::level_enum lvl, const T &msg) { - log(loc, lvl, "{}", msg); - } - - void log(log_clock::time_point log_time, source_loc loc, - level::level_enum lvl, string_view_t msg) { - bool log_enabled = should_log(lvl); - bool traceback_enabled = tracer_.enabled(); - if (!log_enabled && !traceback_enabled) { return; } - - details::log_msg 
log_msg(log_time, loc, name_, lvl, msg); - log_it_(log_msg, log_enabled, traceback_enabled); - } - - void log(source_loc loc, level::level_enum lvl, string_view_t msg) { - bool log_enabled = should_log(lvl); - bool traceback_enabled = tracer_.enabled(); - if (!log_enabled && !traceback_enabled) { return; } - - details::log_msg log_msg(loc, name_, lvl, msg); - log_it_(log_msg, log_enabled, traceback_enabled); - } - - void log(level::level_enum lvl, string_view_t msg) { - log(source_loc {}, lvl, msg); - } - - template - void trace(format_string_t fmt, Args &&...args) { - log(level::trace, fmt, std::forward(args)...); - } - - template - void debug(format_string_t fmt, Args &&...args) { - log(level::debug, fmt, std::forward(args)...); - } - - template - void info(format_string_t fmt, Args &&...args) { - log(level::info, fmt, std::forward(args)...); - } - - template - void warn(format_string_t fmt, Args &&...args) { - log(level::warn, fmt, std::forward(args)...); - } - - template - void error(format_string_t fmt, Args &&...args) { - log(level::err, fmt, std::forward(args)...); - } - - template - void critical(format_string_t fmt, Args &&...args) { - log(level::critical, fmt, std::forward(args)...); - } - -#ifdef SPDLOG_WCHAR_TO_UTF8_SUPPORT - template - void log(source_loc loc, level::level_enum lvl, - wformat_string_t fmt, Args &&...args) { - log_(loc, lvl, details::to_string_view(fmt), - std::forward(args)...); - } - - template - void log(level::level_enum lvl, wformat_string_t fmt, - Args &&...args) { - log(source_loc {}, lvl, fmt, std::forward(args)...); - } - - void log(log_clock::time_point log_time, source_loc loc, - level::level_enum lvl, wstring_view_t msg) { - bool log_enabled = should_log(lvl); - bool traceback_enabled = tracer_.enabled(); - if (!log_enabled && !traceback_enabled) { return; } - - memory_buf_t buf; - details::os::wstr_to_utf8buf( - wstring_view_t(msg.data(), msg.size()), buf); - details::log_msg log_msg(log_time, loc, name_, lvl, - string_view_t(buf.data(), buf.size())); - log_it_(log_msg, log_enabled, traceback_enabled); - } - - void log(source_loc loc, level::level_enum lvl, wstring_view_t msg) { - bool log_enabled = should_log(lvl); - bool traceback_enabled = tracer_.enabled(); - if (!log_enabled && !traceback_enabled) { return; } - - memory_buf_t buf; - details::os::wstr_to_utf8buf( - wstring_view_t(msg.data(), msg.size()), buf); - details::log_msg log_msg( - loc, name_, lvl, string_view_t(buf.data(), buf.size())); - log_it_(log_msg, log_enabled, traceback_enabled); - } - - void log(level::level_enum lvl, wstring_view_t msg) { - log(source_loc {}, lvl, msg); - } - - template - void trace(wformat_string_t fmt, Args &&...args) { - log(level::trace, fmt, std::forward(args)...); - } - - template - void debug(wformat_string_t fmt, Args &&...args) { - log(level::debug, fmt, std::forward(args)...); - } - - template - void info(wformat_string_t fmt, Args &&...args) { - log(level::info, fmt, std::forward(args)...); - } - - template - void warn(wformat_string_t fmt, Args &&...args) { - log(level::warn, fmt, std::forward(args)...); - } - - template - void error(wformat_string_t fmt, Args &&...args) { - log(level::err, fmt, std::forward(args)...); - } - - template - void critical(wformat_string_t fmt, Args &&...args) { - log(level::critical, fmt, std::forward(args)...); - } -#endif - - template - void trace(const T &msg) { - log(level::trace, msg); - } - - template - void debug(const T &msg) { - log(level::debug, msg); - } - - template - void info(const T &msg) { - 
log(level::info, msg); - } - - template - void warn(const T &msg) { - log(level::warn, msg); - } - - template - void error(const T &msg) { - log(level::err, msg); - } - - template - void critical(const T &msg) { - log(level::critical, msg); - } - - // return true logging is enabled for the given level. - bool should_log(level::level_enum msg_level) const { - return msg_level >= level_.load(std::memory_order_relaxed); - } - - // return true if backtrace logging is enabled. - bool should_backtrace() const { return tracer_.enabled(); } - - void set_level(level::level_enum log_level); - - level::level_enum level() const; - - const std::string &name() const; - - // set formatting for the sinks in this logger. - // each sink will get a separate instance of the formatter object. - void set_formatter(std::unique_ptr f); - - // set formatting for the sinks in this logger. - // equivalent to - // set_formatter(make_unique(pattern, time_type)) - // Note: each sink will get a new instance of a formatter object, replacing the old one. - void set_pattern(std::string pattern, - pattern_time_type time_type = pattern_time_type::local); - - // backtrace support. - // efficiently store all debug/trace messages in a circular buffer until needed for debugging. - void enable_backtrace(size_t n_messages); - void disable_backtrace(); - void dump_backtrace(); - - // flush functions - void flush(); - void flush_on(level::level_enum log_level); - level::level_enum flush_level() const; - - // sinks - const std::vector &sinks() const; - - std::vector &sinks(); - - // error handler - void set_error_handler(err_handler); - - // create new logger with same sinks and configuration. - virtual std::shared_ptr clone(std::string logger_name); - -protected: - std::string name_; - std::vector sinks_; - spdlog::level_t level_ {level::info}; - spdlog::level_t flush_level_ {level::off}; - err_handler custom_err_handler_ {nullptr}; - details::backtracer tracer_; - - // common implementation for after templated public api has been resolved - template - void log_(source_loc loc, level::level_enum lvl, string_view_t fmt, - Args &&...args) { - bool log_enabled = should_log(lvl); - bool traceback_enabled = tracer_.enabled(); - if (!log_enabled && !traceback_enabled) { return; } - SPDLOG_TRY { - memory_buf_t buf; -#ifdef SPDLOG_USE_STD_FORMAT - fmt_lib::vformat_to(std::back_inserter(buf), fmt, - fmt_lib::make_format_args(args...)); -#else - fmt::vformat_to( - fmt::appender(buf), fmt, fmt::make_format_args(args...)); -#endif - - details::log_msg log_msg( - loc, name_, lvl, string_view_t(buf.data(), buf.size())); - log_it_(log_msg, log_enabled, traceback_enabled); - } - SPDLOG_LOGGER_CATCH(loc) - } - -#ifdef SPDLOG_WCHAR_TO_UTF8_SUPPORT - template - void log_(source_loc loc, level::level_enum lvl, wstring_view_t fmt, - Args &&...args) { - bool log_enabled = should_log(lvl); - bool traceback_enabled = tracer_.enabled(); - if (!log_enabled && !traceback_enabled) { return; } - SPDLOG_TRY { - // format to wmemory_buffer and convert to utf8 - wmemory_buf_t wbuf; - fmt_lib::vformat_to(std::back_inserter(wbuf), fmt, - fmt_lib::make_format_args( - args...)); - - memory_buf_t buf; - details::os::wstr_to_utf8buf( - wstring_view_t(wbuf.data(), wbuf.size()), buf); - details::log_msg log_msg( - loc, name_, lvl, string_view_t(buf.data(), buf.size())); - log_it_(log_msg, log_enabled, traceback_enabled); - } - SPDLOG_LOGGER_CATCH(loc) - } -#endif // SPDLOG_WCHAR_TO_UTF8_SUPPORT - - // log the given message (if the given log level is high enough), - // and 
save backtrace (if backtrace is enabled). - void log_it_(const details::log_msg &log_msg, bool log_enabled, - bool traceback_enabled); - virtual void sink_it_(const details::log_msg &msg); - virtual void flush_(); - void dump_backtrace_(); - bool should_flush_(const details::log_msg &msg); - - // handle errors during logging. - // default handler prints the error to stderr at max rate of 1 message/sec. - void err_handler_(const std::string &msg); -}; - -void swap(logger &a, logger &b); - -} // namespace spdlog - -#ifdef SPDLOG_HEADER_ONLY -#include "logger-inl.h" -#endif diff --git a/src/common/spdlog/pattern_formatter-inl.h b/src/common/spdlog/pattern_formatter-inl.h deleted file mode 100755 index 5f8b3d4a02e..00000000000 --- a/src/common/spdlog/pattern_formatter-inl.h +++ /dev/null @@ -1,1424 +0,0 @@ -// Copyright(c) 2015-present, Gabi Melman & spdlog contributors. -// Distributed under the MIT License (http://opensource.org/licenses/MIT) - -#pragma once - -#ifndef SPDLOG_HEADER_ONLY -#include -#endif - -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace spdlog { -namespace details { - -/////////////////////////////////////////////////////////////////////// -// name & level pattern appender -/////////////////////////////////////////////////////////////////////// - -class scoped_padder { -public: - scoped_padder(size_t wrapped_size, const padding_info &padinfo, - memory_buf_t &dest) - : padinfo_(padinfo), dest_(dest) { - remaining_pad_ = static_cast(padinfo.width_) - - static_cast(wrapped_size); - if (remaining_pad_ <= 0) { return; } - - if (padinfo_.side_ == padding_info::pad_side::left) { - pad_it(remaining_pad_); - remaining_pad_ = 0; - } else if (padinfo_.side_ == padding_info::pad_side::center) { - auto half_pad = remaining_pad_ / 2; - auto reminder = remaining_pad_ & 1; - pad_it(half_pad); - remaining_pad_ = half_pad + reminder; // for the right side - } - } - - template - static unsigned int count_digits(T n) { - return fmt_helper::count_digits(n); - } - - ~scoped_padder() { - if (remaining_pad_ >= 0) { - pad_it(remaining_pad_); - } else if (padinfo_.truncate_) { - long new_size = static_cast(dest_.size()) + remaining_pad_; - dest_.resize(static_cast(new_size)); - } - } - -private: - void pad_it(long count) { - fmt_helper::append_string_view( - string_view_t(spaces_.data(), static_cast(count)), - dest_); - } - - const padding_info &padinfo_; - memory_buf_t &dest_; - long remaining_pad_; - string_view_t spaces_ { - " ", - 64}; -}; - -struct null_scoped_padder { - null_scoped_padder(size_t /*wrapped_size*/, - const padding_info & /*padinfo*/, memory_buf_t & /*dest*/) {} - - template - static unsigned int count_digits(T /* number */) { - return 0; - } -}; - -template -class name_formatter final : public flag_formatter { -public: - explicit name_formatter(padding_info padinfo) : flag_formatter(padinfo) {} - - void format(const details::log_msg &msg, const std::tm &, - memory_buf_t &dest) override { - ScopedPadder p(msg.logger_name.size(), padinfo_, dest); - fmt_helper::append_string_view(msg.logger_name, dest); - } -}; - -// log level appender -template -class level_formatter final : public flag_formatter { -public: - explicit level_formatter(padding_info padinfo) : flag_formatter(padinfo) {} - - void format(const details::log_msg &msg, const std::tm &, - memory_buf_t &dest) override { - const string_view_t &level_name = 
level::to_string_view(msg.level); - ScopedPadder p(level_name.size(), padinfo_, dest); - fmt_helper::append_string_view(level_name, dest); - } -}; - -// short log level appender -template -class short_level_formatter final : public flag_formatter { -public: - explicit short_level_formatter(padding_info padinfo) - : flag_formatter(padinfo) {} - - void format(const details::log_msg &msg, const std::tm &, - memory_buf_t &dest) override { - string_view_t level_name {level::to_short_c_str(msg.level)}; - ScopedPadder p(level_name.size(), padinfo_, dest); - fmt_helper::append_string_view(level_name, dest); - } -}; - -/////////////////////////////////////////////////////////////////////// -// Date time pattern appenders -/////////////////////////////////////////////////////////////////////// - -static const char *ampm(const tm &t) { - return t.tm_hour >= 12 ? "PM" : "AM"; -} - -static int to12h(const tm &t) { - return t.tm_hour > 12 ? t.tm_hour - 12 : t.tm_hour; -} - -// Abbreviated weekday name -static std::array days { - {"Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat"}}; - -template -class a_formatter final : public flag_formatter { -public: - explicit a_formatter(padding_info padinfo) : flag_formatter(padinfo) {} - - void format(const details::log_msg &, const std::tm &tm_time, - memory_buf_t &dest) override { - string_view_t field_value {days[static_cast(tm_time.tm_wday)]}; - ScopedPadder p(field_value.size(), padinfo_, dest); - fmt_helper::append_string_view(field_value, dest); - } -}; - -// Full weekday name -static std::array full_days {{"Sunday", "Monday", "Tuesday", - "Wednesday", "Thursday", "Friday", "Saturday"}}; - -template -class A_formatter : public flag_formatter { -public: - explicit A_formatter(padding_info padinfo) : flag_formatter(padinfo) {} - - void format(const details::log_msg &, const std::tm &tm_time, - memory_buf_t &dest) override { - string_view_t field_value { - full_days[static_cast(tm_time.tm_wday)]}; - ScopedPadder p(field_value.size(), padinfo_, dest); - fmt_helper::append_string_view(field_value, dest); - } -}; - -// Abbreviated month -static const std::array months {{"Jan", "Feb", "Mar", "Apr", - "May", "Jun", "Jul", "Aug", "Sept", "Oct", "Nov", "Dec"}}; - -template -class b_formatter final : public flag_formatter { -public: - explicit b_formatter(padding_info padinfo) : flag_formatter(padinfo) {} - - void format(const details::log_msg &, const std::tm &tm_time, - memory_buf_t &dest) override { - string_view_t field_value {months[static_cast(tm_time.tm_mon)]}; - ScopedPadder p(field_value.size(), padinfo_, dest); - fmt_helper::append_string_view(field_value, dest); - } -}; - -// Full month name -static const std::array full_months { - {"January", "February", "March", "April", "May", "June", "July", - "August", "September", "October", "November", "December"}}; - -template -class B_formatter final : public flag_formatter { -public: - explicit B_formatter(padding_info padinfo) : flag_formatter(padinfo) {} - - void format(const details::log_msg &, const std::tm &tm_time, - memory_buf_t &dest) override { - string_view_t field_value { - full_months[static_cast(tm_time.tm_mon)]}; - ScopedPadder p(field_value.size(), padinfo_, dest); - fmt_helper::append_string_view(field_value, dest); - } -}; - -// Date and time representation (Thu Aug 23 15:35:46 2014) -template -class c_formatter final : public flag_formatter { -public: - explicit c_formatter(padding_info padinfo) : flag_formatter(padinfo) {} - - void format(const details::log_msg &, const std::tm &tm_time, - 
memory_buf_t &dest) override { - const size_t field_size = 24; - ScopedPadder p(field_size, padinfo_, dest); - - fmt_helper::append_string_view( - days[static_cast(tm_time.tm_wday)], dest); - dest.push_back(' '); - fmt_helper::append_string_view( - months[static_cast(tm_time.tm_mon)], dest); - dest.push_back(' '); - fmt_helper::append_int(tm_time.tm_mday, dest); - dest.push_back(' '); - // time - - fmt_helper::pad2(tm_time.tm_hour, dest); - dest.push_back(':'); - fmt_helper::pad2(tm_time.tm_min, dest); - dest.push_back(':'); - fmt_helper::pad2(tm_time.tm_sec, dest); - dest.push_back(' '); - fmt_helper::append_int(tm_time.tm_year + 1900, dest); - } -}; - -// year - 2 digit -template -class C_formatter final : public flag_formatter { -public: - explicit C_formatter(padding_info padinfo) : flag_formatter(padinfo) {} - - void format(const details::log_msg &, const std::tm &tm_time, - memory_buf_t &dest) override { - const size_t field_size = 2; - ScopedPadder p(field_size, padinfo_, dest); - fmt_helper::pad2(tm_time.tm_year % 100, dest); - } -}; - -// Short MM/DD/YY date, equivalent to %m/%d/%y 08/23/01 -template -class D_formatter final : public flag_formatter { -public: - explicit D_formatter(padding_info padinfo) : flag_formatter(padinfo) {} - - void format(const details::log_msg &, const std::tm &tm_time, - memory_buf_t &dest) override { - const size_t field_size = 10; - ScopedPadder p(field_size, padinfo_, dest); - - fmt_helper::pad2(tm_time.tm_mon + 1, dest); - dest.push_back('/'); - fmt_helper::pad2(tm_time.tm_mday, dest); - dest.push_back('/'); - fmt_helper::pad2(tm_time.tm_year % 100, dest); - } -}; - -// year - 4 digit -template -class Y_formatter final : public flag_formatter { -public: - explicit Y_formatter(padding_info padinfo) : flag_formatter(padinfo) {} - - void format(const details::log_msg &, const std::tm &tm_time, - memory_buf_t &dest) override { - const size_t field_size = 4; - ScopedPadder p(field_size, padinfo_, dest); - fmt_helper::append_int(tm_time.tm_year + 1900, dest); - } -}; - -// month 1-12 -template -class m_formatter final : public flag_formatter { -public: - explicit m_formatter(padding_info padinfo) : flag_formatter(padinfo) {} - - void format(const details::log_msg &, const std::tm &tm_time, - memory_buf_t &dest) override { - const size_t field_size = 2; - ScopedPadder p(field_size, padinfo_, dest); - fmt_helper::pad2(tm_time.tm_mon + 1, dest); - } -}; - -// day of month 1-31 -template -class d_formatter final : public flag_formatter { -public: - explicit d_formatter(padding_info padinfo) : flag_formatter(padinfo) {} - - void format(const details::log_msg &, const std::tm &tm_time, - memory_buf_t &dest) override { - const size_t field_size = 2; - ScopedPadder p(field_size, padinfo_, dest); - fmt_helper::pad2(tm_time.tm_mday, dest); - } -}; - -// hours in 24 format 0-23 -template -class H_formatter final : public flag_formatter { -public: - explicit H_formatter(padding_info padinfo) : flag_formatter(padinfo) {} - - void format(const details::log_msg &, const std::tm &tm_time, - memory_buf_t &dest) override { - const size_t field_size = 2; - ScopedPadder p(field_size, padinfo_, dest); - fmt_helper::pad2(tm_time.tm_hour, dest); - } -}; - -// hours in 12 format 1-12 -template -class I_formatter final : public flag_formatter { -public: - explicit I_formatter(padding_info padinfo) : flag_formatter(padinfo) {} - - void format(const details::log_msg &, const std::tm &tm_time, - memory_buf_t &dest) override { - const size_t field_size = 2; - ScopedPadder 
p(field_size, padinfo_, dest); - fmt_helper::pad2(to12h(tm_time), dest); - } -}; - -// minutes 0-59 -template -class M_formatter final : public flag_formatter { -public: - explicit M_formatter(padding_info padinfo) : flag_formatter(padinfo) {} - - void format(const details::log_msg &, const std::tm &tm_time, - memory_buf_t &dest) override { - const size_t field_size = 2; - ScopedPadder p(field_size, padinfo_, dest); - fmt_helper::pad2(tm_time.tm_min, dest); - } -}; - -// seconds 0-59 -template -class S_formatter final : public flag_formatter { -public: - explicit S_formatter(padding_info padinfo) : flag_formatter(padinfo) {} - - void format(const details::log_msg &, const std::tm &tm_time, - memory_buf_t &dest) override { - const size_t field_size = 2; - ScopedPadder p(field_size, padinfo_, dest); - fmt_helper::pad2(tm_time.tm_sec, dest); - } -}; - -// milliseconds -template -class e_formatter final : public flag_formatter { -public: - explicit e_formatter(padding_info padinfo) : flag_formatter(padinfo) {} - - void format(const details::log_msg &msg, const std::tm &, - memory_buf_t &dest) override { - auto millis = fmt_helper::time_fraction( - msg.time); - const size_t field_size = 3; - ScopedPadder p(field_size, padinfo_, dest); - fmt_helper::pad3(static_cast(millis.count()), dest); - } -}; - -// microseconds -template -class f_formatter final : public flag_formatter { -public: - explicit f_formatter(padding_info padinfo) : flag_formatter(padinfo) {} - - void format(const details::log_msg &msg, const std::tm &, - memory_buf_t &dest) override { - auto micros = fmt_helper::time_fraction( - msg.time); - - const size_t field_size = 6; - ScopedPadder p(field_size, padinfo_, dest); - fmt_helper::pad6(static_cast(micros.count()), dest); - } -}; - -// nanoseconds -template -class F_formatter final : public flag_formatter { -public: - explicit F_formatter(padding_info padinfo) : flag_formatter(padinfo) {} - - void format(const details::log_msg &msg, const std::tm &, - memory_buf_t &dest) override { - auto ns = fmt_helper::time_fraction(msg.time); - const size_t field_size = 9; - ScopedPadder p(field_size, padinfo_, dest); - fmt_helper::pad9(static_cast(ns.count()), dest); - } -}; - -// seconds since epoch -template -class E_formatter final : public flag_formatter { -public: - explicit E_formatter(padding_info padinfo) : flag_formatter(padinfo) {} - - void format(const details::log_msg &msg, const std::tm &, - memory_buf_t &dest) override { - const size_t field_size = 10; - ScopedPadder p(field_size, padinfo_, dest); - auto duration = msg.time.time_since_epoch(); - auto seconds - = std::chrono::duration_cast(duration) - .count(); - fmt_helper::append_int(seconds, dest); - } -}; - -// AM/PM -template -class p_formatter final : public flag_formatter { -public: - explicit p_formatter(padding_info padinfo) : flag_formatter(padinfo) {} - - void format(const details::log_msg &, const std::tm &tm_time, - memory_buf_t &dest) override { - const size_t field_size = 2; - ScopedPadder p(field_size, padinfo_, dest); - fmt_helper::append_string_view(ampm(tm_time), dest); - } -}; - -// 12 hour clock 02:55:02 pm -template -class r_formatter final : public flag_formatter { -public: - explicit r_formatter(padding_info padinfo) : flag_formatter(padinfo) {} - - void format(const details::log_msg &, const std::tm &tm_time, - memory_buf_t &dest) override { - const size_t field_size = 11; - ScopedPadder p(field_size, padinfo_, dest); - - fmt_helper::pad2(to12h(tm_time), dest); - dest.push_back(':'); - 
fmt_helper::pad2(tm_time.tm_min, dest); - dest.push_back(':'); - fmt_helper::pad2(tm_time.tm_sec, dest); - dest.push_back(' '); - fmt_helper::append_string_view(ampm(tm_time), dest); - } -}; - -// 24-hour HH:MM time, equivalent to %H:%M -template -class R_formatter final : public flag_formatter { -public: - explicit R_formatter(padding_info padinfo) : flag_formatter(padinfo) {} - - void format(const details::log_msg &, const std::tm &tm_time, - memory_buf_t &dest) override { - const size_t field_size = 5; - ScopedPadder p(field_size, padinfo_, dest); - - fmt_helper::pad2(tm_time.tm_hour, dest); - dest.push_back(':'); - fmt_helper::pad2(tm_time.tm_min, dest); - } -}; - -// ISO 8601 time format (HH:MM:SS), equivalent to %H:%M:%S -template -class T_formatter final : public flag_formatter { -public: - explicit T_formatter(padding_info padinfo) : flag_formatter(padinfo) {} - - void format(const details::log_msg &, const std::tm &tm_time, - memory_buf_t &dest) override { - const size_t field_size = 8; - ScopedPadder p(field_size, padinfo_, dest); - - fmt_helper::pad2(tm_time.tm_hour, dest); - dest.push_back(':'); - fmt_helper::pad2(tm_time.tm_min, dest); - dest.push_back(':'); - fmt_helper::pad2(tm_time.tm_sec, dest); - } -}; - -// ISO 8601 offset from UTC in timezone (+-HH:MM) -template -class z_formatter final : public flag_formatter { -public: - explicit z_formatter(padding_info padinfo) : flag_formatter(padinfo) {} - - z_formatter() = default; - z_formatter(const z_formatter &) = delete; - z_formatter &operator=(const z_formatter &) = delete; - - void format(const details::log_msg &msg, const std::tm &tm_time, - memory_buf_t &dest) override { - const size_t field_size = 6; - ScopedPadder p(field_size, padinfo_, dest); - - auto total_minutes = get_cached_offset(msg, tm_time); - bool is_negative = total_minutes < 0; - if (is_negative) { - total_minutes = -total_minutes; - dest.push_back('-'); - } else { - dest.push_back('+'); - } - - fmt_helper::pad2(total_minutes / 60, dest); // hours - dest.push_back(':'); - fmt_helper::pad2(total_minutes % 60, dest); // minutes - } - -private: - log_clock::time_point last_update_ {std::chrono::seconds(0)}; - int offset_minutes_ {0}; - - int get_cached_offset(const log_msg &msg, const std::tm &tm_time) { - // refresh every 10 seconds - if (msg.time - last_update_ >= std::chrono::seconds(10)) { - offset_minutes_ = os::utc_minutes_offset(tm_time); - last_update_ = msg.time; - } - return offset_minutes_; - } -}; - -// Thread id -template -class t_formatter final : public flag_formatter { -public: - explicit t_formatter(padding_info padinfo) : flag_formatter(padinfo) {} - - void format(const details::log_msg &msg, const std::tm &, - memory_buf_t &dest) override { - const auto field_size = ScopedPadder::count_digits(msg.thread_id); - ScopedPadder p(field_size, padinfo_, dest); - fmt_helper::append_int(msg.thread_id, dest); - } -}; - -// Current pid -template -class pid_formatter final : public flag_formatter { -public: - explicit pid_formatter(padding_info padinfo) : flag_formatter(padinfo) {} - - void format(const details::log_msg &, const std::tm &, - memory_buf_t &dest) override { - const auto pid = static_cast(details::os::pid()); - auto field_size = ScopedPadder::count_digits(pid); - ScopedPadder p(field_size, padinfo_, dest); - fmt_helper::append_int(pid, dest); - } -}; - -template -class v_formatter final : public flag_formatter { -public: - explicit v_formatter(padding_info padinfo) : flag_formatter(padinfo) {} - - void format(const details::log_msg &msg, 
const std::tm &, - memory_buf_t &dest) override { - ScopedPadder p(msg.payload.size(), padinfo_, dest); - fmt_helper::append_string_view(msg.payload, dest); - } -}; - -class ch_formatter final : public flag_formatter { -public: - explicit ch_formatter(char ch) : ch_(ch) {} - - void format(const details::log_msg &, const std::tm &, - memory_buf_t &dest) override { - dest.push_back(ch_); - } - -private: - char ch_; -}; - -// aggregate user chars to display as is -class aggregate_formatter final : public flag_formatter { -public: - aggregate_formatter() = default; - - void add_ch(char ch) { str_ += ch; } - void format(const details::log_msg &, const std::tm &, - memory_buf_t &dest) override { - fmt_helper::append_string_view(str_, dest); - } - -private: - std::string str_; -}; - -// mark the color range. expect it to be in the form of "%^colored text%$" -class color_start_formatter final : public flag_formatter { -public: - explicit color_start_formatter(padding_info padinfo) - : flag_formatter(padinfo) {} - - void format(const details::log_msg &msg, const std::tm &, - memory_buf_t &dest) override { - msg.color_range_start = dest.size(); - } -}; - -class color_stop_formatter final : public flag_formatter { -public: - explicit color_stop_formatter(padding_info padinfo) - : flag_formatter(padinfo) {} - - void format(const details::log_msg &msg, const std::tm &, - memory_buf_t &dest) override { - msg.color_range_end = dest.size(); - } -}; - -// print source location -template -class source_location_formatter final : public flag_formatter { -public: - explicit source_location_formatter(padding_info padinfo) - : flag_formatter(padinfo) {} - - void format(const details::log_msg &msg, const std::tm &, - memory_buf_t &dest) override { - if (msg.source.empty()) { - ScopedPadder p(0, padinfo_, dest); - return; - } - - size_t text_size; - if (padinfo_.enabled()) { - // calc text size for padding based on "filename:line" - text_size = std::char_traits::length(msg.source.filename) - + ScopedPadder::count_digits(msg.source.line) + 1; - } else { - text_size = 0; - } - - ScopedPadder p(text_size, padinfo_, dest); - fmt_helper::append_string_view(msg.source.filename, dest); - dest.push_back(':'); - fmt_helper::append_int(msg.source.line, dest); - } -}; - -// print source filename -template -class source_filename_formatter final : public flag_formatter { -public: - explicit source_filename_formatter(padding_info padinfo) - : flag_formatter(padinfo) {} - - void format(const details::log_msg &msg, const std::tm &, - memory_buf_t &dest) override { - if (msg.source.empty()) { - ScopedPadder p(0, padinfo_, dest); - return; - } - size_t text_size = padinfo_.enabled() - ? std::char_traits::length(msg.source.filename) - : 0; - ScopedPadder p(text_size, padinfo_, dest); - fmt_helper::append_string_view(msg.source.filename, dest); - } -}; - -template -class short_filename_formatter final : public flag_formatter { -public: - explicit short_filename_formatter(padding_info padinfo) - : flag_formatter(padinfo) {} - -#ifdef _MSC_VER -#pragma warning(push) -#pragma warning(disable : 4127) // consider using 'if constexpr' instead -#endif // _MSC_VER - static const char *basename(const char *filename) { - // if the size is 2 (1 character + null terminator) we can use the more efficient strrchr - // the branch will be elided by optimizations - if (sizeof(os::folder_seps) == 2) { - const char *rv = std::strrchr(filename, os::folder_seps[0]); - return rv != nullptr ? 
rv + 1 : filename; - } else { - const std::reverse_iterator begin( - filename + std::strlen(filename)); - const std::reverse_iterator end(filename); - - const auto it = std::find_first_of(begin, end, - std::begin(os::folder_seps), std::end(os::folder_seps) - 1); - return it != end ? it.base() : filename; - } - } -#ifdef _MSC_VER -#pragma warning(pop) -#endif // _MSC_VER - - void format(const details::log_msg &msg, const std::tm &, - memory_buf_t &dest) override { - if (msg.source.empty()) { - ScopedPadder p(0, padinfo_, dest); - return; - } - auto filename = basename(msg.source.filename); - size_t text_size = padinfo_.enabled() - ? std::char_traits::length(filename) - : 0; - ScopedPadder p(text_size, padinfo_, dest); - fmt_helper::append_string_view(filename, dest); - } -}; - -template -class source_linenum_formatter final : public flag_formatter { -public: - explicit source_linenum_formatter(padding_info padinfo) - : flag_formatter(padinfo) {} - - void format(const details::log_msg &msg, const std::tm &, - memory_buf_t &dest) override { - if (msg.source.empty()) { - ScopedPadder p(0, padinfo_, dest); - return; - } - - auto field_size = ScopedPadder::count_digits(msg.source.line); - ScopedPadder p(field_size, padinfo_, dest); - fmt_helper::append_int(msg.source.line, dest); - } -}; - -// print source funcname -template -class source_funcname_formatter final : public flag_formatter { -public: - explicit source_funcname_formatter(padding_info padinfo) - : flag_formatter(padinfo) {} - - void format(const details::log_msg &msg, const std::tm &, - memory_buf_t &dest) override { - if (msg.source.empty()) { - ScopedPadder p(0, padinfo_, dest); - return; - } - size_t text_size = padinfo_.enabled() - ? std::char_traits::length(msg.source.funcname) - : 0; - ScopedPadder p(text_size, padinfo_, dest); - fmt_helper::append_string_view(msg.source.funcname, dest); - } -}; - -// print elapsed time since last message -template -class elapsed_formatter final : public flag_formatter { -public: - using DurationUnits = Units; - - explicit elapsed_formatter(padding_info padinfo) - : flag_formatter(padinfo), last_message_time_(log_clock::now()) {} - - void format(const details::log_msg &msg, const std::tm &, - memory_buf_t &dest) override { - auto delta = (std::max)( - msg.time - last_message_time_, log_clock::duration::zero()); - auto delta_units = std::chrono::duration_cast(delta); - last_message_time_ = msg.time; - auto delta_count = static_cast(delta_units.count()); - auto n_digits - = static_cast(ScopedPadder::count_digits(delta_count)); - ScopedPadder p(n_digits, padinfo_, dest); - fmt_helper::append_int(delta_count, dest); - } - -private: - log_clock::time_point last_message_time_; -}; - -// Class for formatting Mapped Diagnostic Context (MDC) in log messages. 
-// Example: [logger-name] [info] [mdc_key_1:mdc_value_1 mdc_key_2:mdc_value_2] some message -template -class mdc_formatter : public flag_formatter { -public: - explicit mdc_formatter(padding_info padinfo) : flag_formatter(padinfo) {} - - void format(const details::log_msg &, const std::tm &, - memory_buf_t &dest) override { - auto &mdc_map = mdc::get_context(); - if (mdc_map.empty()) { - ScopedPadder p(0, padinfo_, dest); - return; - } else { - format_mdc(mdc_map, dest); - } - } - - void format_mdc(const mdc::mdc_map_t &mdc_map, memory_buf_t &dest) { - auto last_element = --mdc_map.end(); - for (auto it = mdc_map.begin(); it != mdc_map.end(); ++it) { - auto &pair = *it; - const auto &key = pair.first; - const auto &value = pair.second; - size_t content_size = key.size() + value.size() + 1; // 1 for ':' - - if (it != last_element) { - content_size++; // 1 for ' ' - } - - ScopedPadder p(content_size, padinfo_, dest); - fmt_helper::append_string_view(key, dest); - fmt_helper::append_string_view(":", dest); - fmt_helper::append_string_view(value, dest); - if (it != last_element) { - fmt_helper::append_string_view(" ", dest); - } - } - } -}; - -// Full info formatter -// pattern: [%Y-%m-%d %H:%M:%S.%e] [%n] [%l] [%s:%#] %v -class full_formatter final : public flag_formatter { -public: - explicit full_formatter(padding_info padinfo) : flag_formatter(padinfo) {} - - void format(const details::log_msg &msg, const std::tm &tm_time, - memory_buf_t &dest) override { - using std::chrono::duration_cast; - using std::chrono::milliseconds; - using std::chrono::seconds; - - // cache the date/time part for the next second. - auto duration = msg.time.time_since_epoch(); - auto secs = duration_cast(duration); - - if (cache_timestamp_ != secs || cached_datetime_.size() == 0) { - cached_datetime_.clear(); - cached_datetime_.push_back('['); - fmt_helper::append_int(tm_time.tm_year + 1900, cached_datetime_); - cached_datetime_.push_back('-'); - - fmt_helper::pad2(tm_time.tm_mon + 1, cached_datetime_); - cached_datetime_.push_back('-'); - - fmt_helper::pad2(tm_time.tm_mday, cached_datetime_); - cached_datetime_.push_back(' '); - - fmt_helper::pad2(tm_time.tm_hour, cached_datetime_); - cached_datetime_.push_back(':'); - - fmt_helper::pad2(tm_time.tm_min, cached_datetime_); - cached_datetime_.push_back(':'); - - fmt_helper::pad2(tm_time.tm_sec, cached_datetime_); - cached_datetime_.push_back('.'); - - cache_timestamp_ = secs; - } - dest.append(cached_datetime_.begin(), cached_datetime_.end()); - - auto millis = fmt_helper::time_fraction(msg.time); - fmt_helper::pad3(static_cast(millis.count()), dest); - dest.push_back(']'); - dest.push_back(' '); - - // append logger name if exists - if (msg.logger_name.size() > 0) { - dest.push_back('['); - fmt_helper::append_string_view(msg.logger_name, dest); - dest.push_back(']'); - dest.push_back(' '); - } - - dest.push_back('['); - // wrap the level name with color - msg.color_range_start = dest.size(); - // fmt_helper::append_string_view(level::to_c_str(msg.level), dest); - fmt_helper::append_string_view(level::to_string_view(msg.level), dest); - msg.color_range_end = dest.size(); - dest.push_back(']'); - dest.push_back(' '); - - // add source location if present - if (!msg.source.empty()) { - dest.push_back('['); - const char *filename = details::short_filename_formatter< - details::null_scoped_padder>::basename(msg.source.filename); - fmt_helper::append_string_view(filename, dest); - dest.push_back(':'); - fmt_helper::append_int(msg.source.line, dest); - 
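The mdc_formatter above renders spdlog's Mapped Diagnostic Context wherever the %& flag appears in a pattern. For context, a minimal usage sketch, assuming a stock spdlog recent enough to ship the MDC API (v1.13+); the key and messages are illustrative:

    #include <spdlog/mdc.h>
    #include <spdlog/spdlog.h>

    int main() {
        // Per-thread key:value pairs; mdc_formatter prints them as
        // "key:value" entries separated by spaces wherever %& appears.
        spdlog::mdc::put("request_id", "42");
        spdlog::set_pattern("[%l] [%&] %v");
        spdlog::info("handling request"); // [info] [request_id:42] handling request
        spdlog::mdc::remove("request_id");
        return 0;
    }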
dest.push_back(']'); - dest.push_back(' '); - } - - // add mdc if present - auto &mdc_map = mdc::get_context(); - if (!mdc_map.empty()) { - dest.push_back('['); - mdc_formatter_.format_mdc(mdc_map, dest); - dest.push_back(']'); - dest.push_back(' '); - } - // fmt_helper::append_string_view(msg.msg(), dest); - fmt_helper::append_string_view(msg.payload, dest); - } - -private: - std::chrono::seconds cache_timestamp_ {0}; - memory_buf_t cached_datetime_; - mdc_formatter mdc_formatter_ {padding_info {}}; -}; - -} // namespace details - -SPDLOG_INLINE pattern_formatter::pattern_formatter(std::string pattern, - pattern_time_type time_type, std::string eol, - custom_flags custom_user_flags) - : pattern_(std::move(pattern)) - , eol_(std::move(eol)) - , pattern_time_type_(time_type) - , need_localtime_(false) - , last_log_secs_(0) - , custom_handlers_(std::move(custom_user_flags)) { - std::memset(&cached_tm_, 0, sizeof(cached_tm_)); - compile_pattern_(pattern_); -} - -// use by default full formatter for if pattern is not given -SPDLOG_INLINE pattern_formatter::pattern_formatter( - pattern_time_type time_type, std::string eol) - : pattern_("%+") - , eol_(std::move(eol)) - , pattern_time_type_(time_type) - , need_localtime_(true) - , last_log_secs_(0) { - std::memset(&cached_tm_, 0, sizeof(cached_tm_)); - formatters_.push_back(details::make_unique( - details::padding_info {})); -} - -SPDLOG_INLINE std::unique_ptr pattern_formatter::clone() const { - custom_flags cloned_custom_formatters; - for (auto &it : custom_handlers_) { - cloned_custom_formatters[it.first] = it.second->clone(); - } - auto cloned = details::make_unique(pattern_, - pattern_time_type_, eol_, std::move(cloned_custom_formatters)); - cloned->need_localtime(need_localtime_); -#if defined(__GNUC__) && __GNUC__ < 5 - return std::move(cloned); -#else - return cloned; -#endif -} - -SPDLOG_INLINE void pattern_formatter::format( - const details::log_msg &msg, memory_buf_t &dest) { - if (need_localtime_) { - const auto secs = std::chrono::duration_cast( - msg.time.time_since_epoch()); - if (secs != last_log_secs_) { - cached_tm_ = get_time_(msg); - last_log_secs_ = secs; - } - } - - for (auto &f : formatters_) { - f->format(msg, cached_tm_, dest); - } - // write eol - details::fmt_helper::append_string_view(eol_, dest); -} - -SPDLOG_INLINE void pattern_formatter::set_pattern(std::string pattern) { - pattern_ = std::move(pattern); - need_localtime_ = false; - compile_pattern_(pattern_); -} - -SPDLOG_INLINE void pattern_formatter::need_localtime(bool need) { - need_localtime_ = need; -} - -SPDLOG_INLINE std::tm pattern_formatter::get_time_( - const details::log_msg &msg) { - if (pattern_time_type_ == pattern_time_type::local) { - return details::os::localtime(log_clock::to_time_t(msg.time)); - } - return details::os::gmtime(log_clock::to_time_t(msg.time)); -} - -template -SPDLOG_INLINE void pattern_formatter::handle_flag_( - char flag, details::padding_info padding) { - // process custom flags - auto it = custom_handlers_.find(flag); - if (it != custom_handlers_.end()) { - auto custom_handler = it->second->clone(); - custom_handler->set_padding_info(padding); - formatters_.push_back(std::move(custom_handler)); - return; - } - - // process built-in flags - switch (flag) { - case ('+'): // default formatter - formatters_.push_back( - details::make_unique(padding)); - need_localtime_ = true; - break; - - case 'n': // logger name - formatters_.push_back( - details::make_unique>( - padding)); - break; - - case 'l': // level - formatters_.push_back( 
- details::make_unique>( - padding)); - break; - - case 'L': // short level - formatters_.push_back(details::make_unique< - details::short_level_formatter>(padding)); - break; - - case ('t'): // thread id - formatters_.push_back( - details::make_unique>( - padding)); - break; - - case ('v'): // the message text - formatters_.push_back( - details::make_unique>( - padding)); - break; - - case ('a'): // weekday - formatters_.push_back( - details::make_unique>( - padding)); - need_localtime_ = true; - break; - - case ('A'): // short weekday - formatters_.push_back( - details::make_unique>( - padding)); - need_localtime_ = true; - break; - - case ('b'): - case ('h'): // month - formatters_.push_back( - details::make_unique>( - padding)); - need_localtime_ = true; - break; - - case ('B'): // short month - formatters_.push_back( - details::make_unique>( - padding)); - need_localtime_ = true; - break; - - case ('c'): // datetime - formatters_.push_back( - details::make_unique>( - padding)); - need_localtime_ = true; - break; - - case ('C'): // year 2 digits - formatters_.push_back( - details::make_unique>( - padding)); - need_localtime_ = true; - break; - - case ('Y'): // year 4 digits - formatters_.push_back( - details::make_unique>( - padding)); - need_localtime_ = true; - break; - - case ('D'): - case ('x'): // datetime MM/DD/YY - formatters_.push_back( - details::make_unique>( - padding)); - need_localtime_ = true; - break; - - case ('m'): // month 1-12 - formatters_.push_back( - details::make_unique>( - padding)); - need_localtime_ = true; - break; - - case ('d'): // day of month 1-31 - formatters_.push_back( - details::make_unique>( - padding)); - need_localtime_ = true; - break; - - case ('H'): // hours 24 - formatters_.push_back( - details::make_unique>( - padding)); - need_localtime_ = true; - break; - - case ('I'): // hours 12 - formatters_.push_back( - details::make_unique>( - padding)); - need_localtime_ = true; - break; - - case ('M'): // minutes - formatters_.push_back( - details::make_unique>( - padding)); - need_localtime_ = true; - break; - - case ('S'): // seconds - formatters_.push_back( - details::make_unique>( - padding)); - need_localtime_ = true; - break; - - case ('e'): // milliseconds - formatters_.push_back( - details::make_unique>( - padding)); - break; - - case ('f'): // microseconds - formatters_.push_back( - details::make_unique>( - padding)); - break; - - case ('F'): // nanoseconds - formatters_.push_back( - details::make_unique>( - padding)); - break; - - case ('E'): // seconds since epoch - formatters_.push_back( - details::make_unique>( - padding)); - break; - - case ('p'): // am/pm - formatters_.push_back( - details::make_unique>( - padding)); - need_localtime_ = true; - break; - - case ('r'): // 12 hour clock 02:55:02 pm - formatters_.push_back( - details::make_unique>( - padding)); - need_localtime_ = true; - break; - - case ('R'): // 24-hour HH:MM time - formatters_.push_back( - details::make_unique>( - padding)); - need_localtime_ = true; - break; - - case ('T'): - case ('X'): // ISO 8601 time format (HH:MM:SS) - formatters_.push_back( - details::make_unique>( - padding)); - need_localtime_ = true; - break; - - case ('z'): // timezone - formatters_.push_back( - details::make_unique>( - padding)); - need_localtime_ = true; - break; - - case ('P'): // pid - formatters_.push_back( - details::make_unique>( - padding)); - break; - - case ('^'): // color range start - formatters_.push_back( - details::make_unique( - padding)); - break; - - case ('$'): // color 
range end - formatters_.push_back( - details::make_unique( - padding)); - break; - - case ('@'): // source location (filename:filenumber) - formatters_.push_back(details::make_unique< - details::source_location_formatter>(padding)); - break; - - case ('s'): // short source filename - without directory name - formatters_.push_back(details::make_unique< - details::short_filename_formatter>(padding)); - break; - - case ('g'): // full source filename - formatters_.push_back(details::make_unique< - details::source_filename_formatter>(padding)); - break; - - case ('#'): // source line number - formatters_.push_back(details::make_unique< - details::source_linenum_formatter>(padding)); - break; - - case ('!'): // source funcname - formatters_.push_back(details::make_unique< - details::source_funcname_formatter>(padding)); - break; - - case ('%'): // % char - formatters_.push_back( - details::make_unique('%')); - break; - - case ('u'): // elapsed time since last log message in nanos - formatters_.push_back( - details::make_unique>(padding)); - break; - - case ('i'): // elapsed time since last log message in micros - formatters_.push_back( - details::make_unique>(padding)); - break; - - case ('o'): // elapsed time since last log message in millis - formatters_.push_back( - details::make_unique>(padding)); - break; - - case ('O'): // elapsed time since last log message in seconds - formatters_.push_back(details::make_unique< - details::elapsed_formatter>( - padding)); - break; - - case ('&'): - formatters_.push_back( - details::make_unique>( - padding)); - break; - - default: // Unknown flag appears as is - auto unknown_flag - = details::make_unique(); - - if (!padding.truncate_) { - unknown_flag->add_ch('%'); - unknown_flag->add_ch(flag); - formatters_.push_back((std::move(unknown_flag))); - } - // fix issue #1617 (prev char was '!' and should have been treated as funcname flag - // instead of truncating flag) spdlog::set_pattern("[%10!] %v") => "[ main] some - // message" spdlog::set_pattern("[%3!!] %v") => "[mai] some message" - else { - padding.truncate_ = false; - formatters_.push_back(details::make_unique< - details::source_funcname_formatter>(padding)); - unknown_flag->add_ch(flag); - formatters_.push_back((std::move(unknown_flag))); - } - - break; - } -} - -// Extract given pad spec (e.g. %8X, %=8X, %-8!X, %8!X, %=8!X, %-8!X, %+8!X) -// Advance the given it pass the end of the padding spec found (if any) -// Return padding. -SPDLOG_INLINE details::padding_info pattern_formatter::handle_padspec_( - std::string::const_iterator &it, std::string::const_iterator end) { - using details::padding_info; - using details::scoped_padder; - const size_t max_width = 64; - if (it == end) { return padding_info {}; } - - padding_info::pad_side side; - switch (*it) { - case '-': - side = padding_info::pad_side::right; - ++it; - break; - case '=': - side = padding_info::pad_side::center; - ++it; - break; - default: side = details::padding_info::pad_side::left; break; - } - - if (it == end || !std::isdigit(static_cast(*it))) { - return padding_info {}; // no padding if no digit found here - } - - auto width = static_cast(*it) - '0'; - for (++it; it != end && std::isdigit(static_cast(*it)); - ++it) { - auto digit = static_cast(*it) - '0'; - width = width * 10 + digit; - } - - // search for the optional truncate marker '!' 
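For reference, the pad spec being parsed here is: an optional alignment ('-' pads on the right, '=' centers, left padding by default), a width capped at 64, then an optional '!' truncate marker. A short sketch of the resulting behavior against the plain spdlog API; the outputs in the comments are illustrative:

    #include <spdlog/spdlog.h>

    int main() {
        spdlog::set_pattern("[%8l] %v");  // pad level to 8 on the left: "[    info]"
        spdlog::info("padded");
        spdlog::set_pattern("[%-8l] %v"); // pad on the right: "[info    ]"
        spdlog::info("padded");
        spdlog::set_pattern("[%=8l] %v"); // centered: "[  info  ]"
        spdlog::info("padded");
        spdlog::set_pattern("[%3!l] %v"); // pad and truncate to exactly 3: "[inf]"
        spdlog::info("truncated");
        return 0;
    }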
- bool truncate; - if (it != end && *it == '!') { - truncate = true; - ++it; - } else { - truncate = false; - } - return details::padding_info { - std::min(width, max_width), side, truncate}; -} - -SPDLOG_INLINE void pattern_formatter::compile_pattern_( - const std::string &pattern) { - auto end = pattern.end(); - std::unique_ptr user_chars; - formatters_.clear(); - for (auto it = pattern.begin(); it != end; ++it) { - if (*it == '%') { - if (user_chars) // append user chars found so far - { - formatters_.push_back(std::move(user_chars)); - } - - auto padding = handle_padspec_(++it, end); - - if (it != end) { - if (padding.enabled()) { - handle_flag_(*it, padding); - } else { - handle_flag_(*it, padding); - } - } else { - break; - } - } else // chars not following the % sign should be displayed as is - { - if (!user_chars) { - user_chars - = details::make_unique(); - } - user_chars->add_ch(*it); - } - } - if (user_chars) // append raw chars found so far - { - formatters_.push_back(std::move(user_chars)); - } -} -} // namespace spdlog diff --git a/src/common/spdlog/pattern_formatter.h b/src/common/spdlog/pattern_formatter.h deleted file mode 100755 index 3f19b6e8973..00000000000 --- a/src/common/spdlog/pattern_formatter.h +++ /dev/null @@ -1,117 +0,0 @@ -// Copyright(c) 2015-present, Gabi Melman & spdlog contributors. -// Distributed under the MIT License (http://opensource.org/licenses/MIT) - -#pragma once - -#include -#include -#include -#include - -#include -#include -#include - -#include -#include -#include - -namespace spdlog { -namespace details { - -// padding information. -struct padding_info { - enum class pad_side { left, right, center }; - - padding_info() = default; - padding_info(size_t width, padding_info::pad_side side, bool truncate) - : width_(width), side_(side), truncate_(truncate), enabled_(true) {} - - bool enabled() const { return enabled_; } - size_t width_ = 0; - pad_side side_ = pad_side::left; - bool truncate_ = false; - bool enabled_ = false; -}; - -class SPDLOG_API flag_formatter { -public: - explicit flag_formatter(padding_info padinfo) : padinfo_(padinfo) {} - flag_formatter() = default; - virtual ~flag_formatter() = default; - virtual void format(const details::log_msg &msg, const std::tm &tm_time, - memory_buf_t &dest) - = 0; - -protected: - padding_info padinfo_; -}; - -} // namespace details - -class SPDLOG_API custom_flag_formatter : public details::flag_formatter { -public: - virtual std::unique_ptr clone() const = 0; - - void set_padding_info(const details::padding_info &padding) { - flag_formatter::padinfo_ = padding; - } -}; - -class SPDLOG_API pattern_formatter final : public formatter { -public: - using custom_flags - = std::unordered_map>; - - explicit pattern_formatter(std::string pattern, - pattern_time_type time_type = pattern_time_type::local, - std::string eol = spdlog::details::os::default_eol, - custom_flags custom_user_flags = custom_flags()); - - // use default pattern is not given - explicit pattern_formatter( - pattern_time_type time_type = pattern_time_type::local, - std::string eol = spdlog::details::os::default_eol); - - pattern_formatter(const pattern_formatter &other) = delete; - pattern_formatter &operator=(const pattern_formatter &other) = delete; - - std::unique_ptr clone() const override; - void format(const details::log_msg &msg, memory_buf_t &dest) override; - - template - pattern_formatter &add_flag(char flag, Args &&...args) { - custom_handlers_[flag] - = details::make_unique(std::forward(args)...); - return *this; - } - void 
set_pattern(std::string pattern); - void need_localtime(bool need = true); - -private: - std::string pattern_; - std::string eol_; - pattern_time_type pattern_time_type_; - bool need_localtime_; - std::tm cached_tm_; - std::chrono::seconds last_log_secs_; - std::vector> formatters_; - custom_flags custom_handlers_; - - std::tm get_time_(const details::log_msg &msg); - template - void handle_flag_(char flag, details::padding_info padding); - - // Extract given pad spec (e.g. %8X) - // Advance the given it pass the end of the padding spec found (if any) - // Return padding. - static details::padding_info handle_padspec_( - std::string::const_iterator &it, std::string::const_iterator end); - - void compile_pattern_(const std::string &pattern); -}; -} // namespace spdlog - -#ifdef SPDLOG_HEADER_ONLY -#include "pattern_formatter-inl.h" -#endif diff --git a/src/common/spdlog/sinks/ansicolor_sink-inl.h b/src/common/spdlog/sinks/ansicolor_sink-inl.h deleted file mode 100755 index bfd5aa00784..00000000000 --- a/src/common/spdlog/sinks/ansicolor_sink-inl.h +++ /dev/null @@ -1,138 +0,0 @@ -// Copyright(c) 2015-present, Gabi Melman & spdlog contributors. -// Distributed under the MIT License (http://opensource.org/licenses/MIT) - -#pragma once - -#ifndef SPDLOG_HEADER_ONLY -#include -#endif - -#include -#include - -namespace spdlog { -namespace sinks { - -template -SPDLOG_INLINE ansicolor_sink::ansicolor_sink( - FILE *target_file, color_mode mode) - : target_file_(target_file) - , mutex_(ConsoleMutex::mutex()) - , formatter_(details::make_unique()) - -{ - set_color_mode(mode); - colors_.at(level::trace) = to_string_(white); - colors_.at(level::debug) = to_string_(cyan); - colors_.at(level::info) = to_string_(green); - colors_.at(level::warn) = to_string_(yellow_bold); - colors_.at(level::err) = to_string_(red_bold); - colors_.at(level::critical) = to_string_(bold_on_red); - colors_.at(level::off) = to_string_(reset); -} - -template -SPDLOG_INLINE void ansicolor_sink::set_color( - level::level_enum color_level, string_view_t color) { - std::lock_guard lock(mutex_); - colors_.at(static_cast(color_level)) = to_string_(color); -} - -template -SPDLOG_INLINE void ansicolor_sink::log( - const details::log_msg &msg) { - // Wrap the originally formatted message in color codes. - // If color is not supported in the terminal, log as is instead. 
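custom_flag_formatter and pattern_formatter::add_flag(), declared in the header above, are the public extension point for user-defined pattern flags. A sketch of how a flag plugs in, closely following spdlog's documented usage; the class name and the '*' flag are illustrative:

    #include <memory>
    #include <string>

    #include <spdlog/pattern_formatter.h>
    #include <spdlog/spdlog.h>

    class my_flag final : public spdlog::custom_flag_formatter {
    public:
        void format(const spdlog::details::log_msg &, const std::tm &,
                spdlog::memory_buf_t &dest) override {
            std::string txt = "custom-flag";
            dest.append(txt.data(), txt.data() + txt.size());
        }
        std::unique_ptr<custom_flag_formatter> clone() const override {
            return spdlog::details::make_unique<my_flag>();
        }
    };

    int main() {
        auto formatter = std::make_unique<spdlog::pattern_formatter>();
        formatter->add_flag<my_flag>('*').set_pattern("[%n] [%*] [%l] %v");
        spdlog::set_formatter(std::move(formatter));
        spdlog::info("hello"); // '*' now expands to "custom-flag"
        return 0;
    }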
- std::lock_guard lock(mutex_); - msg.color_range_start = 0; - msg.color_range_end = 0; - memory_buf_t formatted; - formatter_->format(msg, formatted); - if (should_do_colors_ && msg.color_range_end > msg.color_range_start) { - // before color range - print_range_(formatted, 0, msg.color_range_start); - // in color range - print_ccode_(colors_.at(static_cast(msg.level))); - print_range_(formatted, msg.color_range_start, msg.color_range_end); - print_ccode_(reset); - // after color range - print_range_(formatted, msg.color_range_end, formatted.size()); - } else // no color - { - print_range_(formatted, 0, formatted.size()); - } - fflush(target_file_); -} - -template -SPDLOG_INLINE void ansicolor_sink::flush() { - std::lock_guard lock(mutex_); - fflush(target_file_); -} - -template -SPDLOG_INLINE void ansicolor_sink::set_pattern( - const std::string &pattern) { - std::lock_guard lock(mutex_); - formatter_ = std::unique_ptr( - new pattern_formatter(pattern)); -} - -template -SPDLOG_INLINE void ansicolor_sink::set_formatter( - std::unique_ptr sink_formatter) { - std::lock_guard lock(mutex_); - formatter_ = std::move(sink_formatter); -} - -template -SPDLOG_INLINE bool ansicolor_sink::should_color() { - return should_do_colors_; -} - -template -SPDLOG_INLINE void ansicolor_sink::set_color_mode( - color_mode mode) { - switch (mode) { - case color_mode::always: should_do_colors_ = true; return; - case color_mode::automatic: - should_do_colors_ = details::os::in_terminal(target_file_) - && details::os::is_color_terminal(); - return; - case color_mode::never: should_do_colors_ = false; return; - default: should_do_colors_ = false; - } -} - -template -SPDLOG_INLINE void ansicolor_sink::print_ccode_( - const string_view_t &color_code) { - fwrite(color_code.data(), sizeof(char), color_code.size(), target_file_); -} - -template -SPDLOG_INLINE void ansicolor_sink::print_range_( - const memory_buf_t &formatted, size_t start, size_t end) { - fwrite(formatted.data() + start, sizeof(char), end - start, target_file_); -} - -template -SPDLOG_INLINE std::string ansicolor_sink::to_string_( - const string_view_t &sv) { - return std::string(sv.data(), sv.size()); -} - -// ansicolor_stdout_sink -template -SPDLOG_INLINE ansicolor_stdout_sink::ansicolor_stdout_sink( - color_mode mode) - : ansicolor_sink(stdout, mode) {} - -// ansicolor_stderr_sink -template -SPDLOG_INLINE ansicolor_stderr_sink::ansicolor_stderr_sink( - color_mode mode) - : ansicolor_sink(stderr, mode) {} - -} // namespace sinks -} // namespace spdlog diff --git a/src/common/spdlog/sinks/base_sink-inl.h b/src/common/spdlog/sinks/base_sink-inl.h deleted file mode 100755 index 8c94fa5d587..00000000000 --- a/src/common/spdlog/sinks/base_sink-inl.h +++ /dev/null @@ -1,62 +0,0 @@ -// Copyright(c) 2015-present, Gabi Melman & spdlog contributors. 
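The ansicolor sink wraps only the span that color_start_formatter and color_stop_formatter mark (%^ and %$ in a pattern) in the per-level ANSI code. A usage sketch with the stock color console factory; the logger name and pattern are illustrative:

    #include <spdlog/sinks/stdout_color_sinks.h>
    #include <spdlog/spdlog.h>

    int main() {
        auto console = spdlog::stdout_color_mt("console");
        console->set_pattern("%^[%l]%$ %v"); // only "[level]" is colorized
        console->info("level tag in green");
        console->error("level tag in bold red");
        return 0;
    }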
-// Distributed under the MIT License (http://opensource.org/licenses/MIT) - -#pragma once - -#ifndef SPDLOG_HEADER_ONLY -#include -#endif - -#include -#include - -#include -#include - -template -SPDLOG_INLINE spdlog::sinks::base_sink::base_sink() - : formatter_ {details::make_unique()} {} - -template -SPDLOG_INLINE spdlog::sinks::base_sink::base_sink( - std::unique_ptr formatter) - : formatter_ {std::move(formatter)} {} - -template -void SPDLOG_INLINE spdlog::sinks::base_sink::log( - const details::log_msg &msg) { - std::lock_guard lock(mutex_); - sink_it_(msg); -} - -template -void SPDLOG_INLINE spdlog::sinks::base_sink::flush() { - std::lock_guard lock(mutex_); - flush_(); -} - -template -void SPDLOG_INLINE spdlog::sinks::base_sink::set_pattern( - const std::string &pattern) { - std::lock_guard lock(mutex_); - set_pattern_(pattern); -} - -template -void SPDLOG_INLINE spdlog::sinks::base_sink::set_formatter( - std::unique_ptr sink_formatter) { - std::lock_guard lock(mutex_); - set_formatter_(std::move(sink_formatter)); -} - -template -void SPDLOG_INLINE spdlog::sinks::base_sink::set_pattern_( - const std::string &pattern) { - set_formatter_(details::make_unique(pattern)); -} - -template -void SPDLOG_INLINE spdlog::sinks::base_sink::set_formatter_( - std::unique_ptr sink_formatter) { - formatter_ = std::move(sink_formatter); -} diff --git a/src/common/spdlog/sinks/base_sink.h b/src/common/spdlog/sinks/base_sink.h deleted file mode 100755 index 2a37d9d1555..00000000000 --- a/src/common/spdlog/sinks/base_sink.h +++ /dev/null @@ -1,52 +0,0 @@ -// Copyright(c) 2015-present, Gabi Melman & spdlog contributors. -// Distributed under the MIT License (http://opensource.org/licenses/MIT) - -#pragma once -// -// base sink templated over a mutex (either dummy or real) -// concrete implementation should override the sink_it_() and flush_() methods. -// locking is taken care of in this class - no locking needed by the -// implementers.. -// - -#include -#include -#include - -namespace spdlog { -namespace sinks { -template -class SPDLOG_API base_sink : public sink { -public: - base_sink(); - explicit base_sink(std::unique_ptr formatter); - ~base_sink() override = default; - - base_sink(const base_sink &) = delete; - base_sink(base_sink &&) = delete; - - base_sink &operator=(const base_sink &) = delete; - base_sink &operator=(base_sink &&) = delete; - - void log(const details::log_msg &msg) final; - void flush() final; - void set_pattern(const std::string &pattern) final; - void set_formatter(std::unique_ptr sink_formatter) final; - -protected: - // sink formatter - std::unique_ptr formatter_; - Mutex mutex_; - - virtual void sink_it_(const details::log_msg &msg) = 0; - virtual void flush_() = 0; - virtual void set_pattern_(const std::string &pattern); - virtual void set_formatter_( - std::unique_ptr sink_formatter); -}; -} // namespace sinks -} // namespace spdlog - -#ifdef SPDLOG_HEADER_ONLY -#include "base_sink-inl.h" -#endif diff --git a/src/common/spdlog/sinks/basic_file_sink-inl.h b/src/common/spdlog/sinks/basic_file_sink-inl.h deleted file mode 100755 index ad2ee24eb42..00000000000 --- a/src/common/spdlog/sinks/basic_file_sink-inl.h +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright(c) 2015-present, Gabi Melman & spdlog contributors. 
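base_sink centralizes locking and formatter storage, so a concrete sink only implements sink_it_() and flush_(). A minimal custom sink sketch under that contract; the class name is illustrative:

    #include <iostream>
    #include <mutex>

    #include <spdlog/details/null_mutex.h>
    #include <spdlog/sinks/base_sink.h>
    #include <spdlog/spdlog.h>

    template <typename Mutex>
    class echo_sink final : public spdlog::sinks::base_sink<Mutex> {
    protected:
        void sink_it_(const spdlog::details::log_msg &msg) override {
            // base_sink::log() already holds mutex_ when we get here.
            spdlog::memory_buf_t formatted;
            spdlog::sinks::base_sink<Mutex>::formatter_->format(msg, formatted);
            std::cout.write(formatted.data(),
                    static_cast<std::streamsize>(formatted.size()));
        }
        void flush_() override { std::cout << std::flush; }
    };

    using echo_sink_mt = echo_sink<std::mutex>;                  // thread safe
    using echo_sink_st = echo_sink<spdlog::details::null_mutex>; // single thread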
-// Distributed under the MIT License (http://opensource.org/licenses/MIT) - -#pragma once - -#ifndef SPDLOG_HEADER_ONLY -#include -#endif - -#include -#include - -namespace spdlog { -namespace sinks { - -template -SPDLOG_INLINE basic_file_sink::basic_file_sink( - const filename_t &filename, bool truncate, - const file_event_handlers &event_handlers) - : file_helper_ {event_handlers} { - file_helper_.open(filename, truncate); -} - -template -SPDLOG_INLINE const filename_t &basic_file_sink::filename() const { - return file_helper_.filename(); -} - -template -SPDLOG_INLINE void basic_file_sink::sink_it_( - const details::log_msg &msg) { - memory_buf_t formatted; - base_sink::formatter_->format(msg, formatted); - file_helper_.write(formatted); -} - -template -SPDLOG_INLINE void basic_file_sink::flush_() { - file_helper_.flush(); -} - -} // namespace sinks -} // namespace spdlog diff --git a/src/common/spdlog/sinks/basic_file_sink.h b/src/common/spdlog/sinks/basic_file_sink.h deleted file mode 100755 index 3d7742356e3..00000000000 --- a/src/common/spdlog/sinks/basic_file_sink.h +++ /dev/null @@ -1,62 +0,0 @@ -// Copyright(c) 2015-present, Gabi Melman & spdlog contributors. -// Distributed under the MIT License (http://opensource.org/licenses/MIT) - -#pragma once - -#include -#include -#include -#include - -#include -#include - -namespace spdlog { -namespace sinks { -/* - * Trivial file sink with single file as target - */ -template -class basic_file_sink final : public base_sink { -public: - explicit basic_file_sink(const filename_t &filename, bool truncate = false, - const file_event_handlers &event_handlers = {}); - const filename_t &filename() const; - -protected: - void sink_it_(const details::log_msg &msg) override; - void flush_() override; - -private: - details::file_helper file_helper_; -}; - -using basic_file_sink_mt = basic_file_sink; -using basic_file_sink_st = basic_file_sink; - -} // namespace sinks - -// -// factory functions -// -template -inline std::shared_ptr basic_logger_mt(const std::string &logger_name, - const filename_t &filename, bool truncate = false, - const file_event_handlers &event_handlers = {}) { - return Factory::template create( - logger_name, filename, truncate, event_handlers); -} - -template -inline std::shared_ptr basic_logger_st(const std::string &logger_name, - const filename_t &filename, bool truncate = false, - const file_event_handlers &event_handlers = {}) { - return Factory::template create( - logger_name, filename, truncate, event_handlers); -} - -} // namespace spdlog - -#ifdef SPDLOG_HEADER_ONLY -#include "basic_file_sink-inl.h" -#endif diff --git a/src/common/spdlog/sinks/null_sink.h b/src/common/spdlog/sinks/null_sink.h deleted file mode 100755 index 628bdee33ac..00000000000 --- a/src/common/spdlog/sinks/null_sink.h +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright(c) 2015-present, Gabi Melman & spdlog contributors. 
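For context, the factory helpers above are the usual way this sink gets instantiated; the logger name and file name are illustrative:

    #include <spdlog/sinks/basic_file_sink.h>
    #include <spdlog/spdlog.h>

    int main() {
        // One basic_file_sink_mt, registered globally under "file".
        auto logger = spdlog::basic_logger_mt("file", "app.log", /*truncate=*/false);
        logger->info("written to app.log");
        spdlog::get("file")->flush(); // same logger, fetched from the registry
        return 0;
    }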
-// Distributed under the MIT License (http://opensource.org/licenses/MIT) - -#pragma once - -#include -#include -#include - -#include - -namespace spdlog { -namespace sinks { - -template -class null_sink : public base_sink { -protected: - void sink_it_(const details::log_msg &) override {} - void flush_() override {} -}; - -using null_sink_mt = null_sink; -using null_sink_st = null_sink; - -} // namespace sinks - -template -inline std::shared_ptr null_logger_mt(const std::string &logger_name) { - auto null_logger - = Factory::template create(logger_name); - null_logger->set_level(level::off); - return null_logger; -} - -template -inline std::shared_ptr null_logger_st(const std::string &logger_name) { - auto null_logger - = Factory::template create(logger_name); - null_logger->set_level(level::off); - return null_logger; -} - -} // namespace spdlog diff --git a/src/common/spdlog/sinks/ostream_sink.h b/src/common/spdlog/sinks/ostream_sink.h deleted file mode 100755 index 383ae4cb3aa..00000000000 --- a/src/common/spdlog/sinks/ostream_sink.h +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright(c) 2015-present, Gabi Melman & spdlog contributors. -// Distributed under the MIT License (http://opensource.org/licenses/MIT) - -#pragma once - -#include -#include - -#include -#include - -namespace spdlog { -namespace sinks { -template -class ostream_sink final : public base_sink { -public: - explicit ostream_sink(std::ostream &os, bool force_flush = false) - : ostream_(os), force_flush_(force_flush) {} - ostream_sink(const ostream_sink &) = delete; - ostream_sink &operator=(const ostream_sink &) = delete; - -protected: - void sink_it_(const details::log_msg &msg) override { - memory_buf_t formatted; - base_sink::formatter_->format(msg, formatted); - ostream_.write(formatted.data(), - static_cast(formatted.size())); - if (force_flush_) { ostream_.flush(); } - } - - void flush_() override { ostream_.flush(); } - - std::ostream &ostream_; - bool force_flush_; -}; - -using ostream_sink_mt = ostream_sink; -using ostream_sink_st = ostream_sink; - -} // namespace sinks -} // namespace spdlog diff --git a/src/common/spdlog/sinks/rotating_file_sink-inl.h b/src/common/spdlog/sinks/rotating_file_sink-inl.h deleted file mode 100755 index 8b491c9828b..00000000000 --- a/src/common/spdlog/sinks/rotating_file_sink-inl.h +++ /dev/null @@ -1,143 +0,0 @@ -// Copyright(c) 2015-present, Gabi Melman & spdlog contributors. -// Distributed under the MIT License (http://opensource.org/licenses/MIT) - -#pragma once - -#ifndef SPDLOG_HEADER_ONLY -#include -#endif - -#include - -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -namespace spdlog { -namespace sinks { - -template -SPDLOG_INLINE rotating_file_sink::rotating_file_sink( - filename_t base_filename, std::size_t max_size, std::size_t max_files, - bool rotate_on_open, const file_event_handlers &event_handlers) - : base_filename_(std::move(base_filename)) - , max_size_(max_size) - , max_files_(max_files) - , file_helper_ {event_handlers} { - if (max_size == 0) { - throw_spdlog_ex( - "rotating sink constructor: max_size arg cannot be zero"); - } - - if (max_files > 200000) { - throw_spdlog_ex( - "rotating sink constructor: max_files arg cannot exceed " - "200000"); - } - file_helper_.open(calc_filename(base_filename_, 0)); - current_size_ = file_helper_.size(); // expensive. 
called only once - if (rotate_on_open && current_size_ > 0) { - rotate_(); - current_size_ = 0; - } -} - -// calc filename according to index and file extension if exists. -// e.g. calc_filename("logs/mylog.txt, 3) => "logs/mylog.3.txt". -template -SPDLOG_INLINE filename_t rotating_file_sink::calc_filename( - const filename_t &filename, std::size_t index) { - if (index == 0u) { return filename; } - - filename_t basename, ext; - std::tie(basename, ext) - = details::file_helper::split_by_extension(filename); - return fmt_lib::format(SPDLOG_FILENAME_T("{}.{}{}"), basename, index, ext); -} - -template -SPDLOG_INLINE filename_t rotating_file_sink::filename() { - std::lock_guard lock(base_sink::mutex_); - return file_helper_.filename(); -} - -template -SPDLOG_INLINE void rotating_file_sink::sink_it_( - const details::log_msg &msg) { - memory_buf_t formatted; - base_sink::formatter_->format(msg, formatted); - auto new_size = current_size_ + formatted.size(); - - // rotate if the new estimated file size exceeds max size. - // rotate only if the real size > 0 to better deal with full disk (see issue #2261). - // we only check the real size when new_size > max_size_ because it is relatively expensive. - if (new_size > max_size_) { - file_helper_.flush(); - if (file_helper_.size() > 0) { - rotate_(); - new_size = formatted.size(); - } - } - file_helper_.write(formatted); - current_size_ = new_size; -} - -template -SPDLOG_INLINE void rotating_file_sink::flush_() { - file_helper_.flush(); -} - -// Rotate files: -// log.txt -> log.1.txt -// log.1.txt -> log.2.txt -// log.2.txt -> log.3.txt -// log.3.txt -> delete -template -SPDLOG_INLINE void rotating_file_sink::rotate_() { - using details::os::filename_to_str; - using details::os::path_exists; - - file_helper_.close(); - for (auto i = max_files_; i > 0; --i) { - filename_t src = calc_filename(base_filename_, i - 1); - if (!path_exists(src)) { continue; } - filename_t target = calc_filename(base_filename_, i); - - if (!rename_file_(src, target)) { - // if failed try again after a small delay. - // this is a workaround to a windows issue, where very high rotation - // rates can cause the rename to fail with permission denied (because of antivirus?). - details::os::sleep_for_millis(100); - if (!rename_file_(src, target)) { - file_helper_.reopen( - true); // truncate the log file anyway to prevent it to grow beyond its limit! - current_size_ = 0; - throw_spdlog_ex("rotating_file_sink: failed renaming " - + filename_to_str(src) + " to " - + filename_to_str(target), - errno); - } - } - } - file_helper_.reopen(true); -} - -// delete the target if exists, and rename the src file to target -// return true on success, false otherwise. -template -SPDLOG_INLINE bool rotating_file_sink::rename_file_( - const filename_t &src_filename, const filename_t &target_filename) { - // try to delete the target file in case it already exists. - (void)details::os::remove(target_filename); - return details::os::rename(src_filename, target_filename) == 0; -} - -} // namespace sinks -} // namespace spdlog diff --git a/src/common/spdlog/sinks/rotating_file_sink.h b/src/common/spdlog/sinks/rotating_file_sink.h deleted file mode 100755 index 937c165e8ae..00000000000 --- a/src/common/spdlog/sinks/rotating_file_sink.h +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright(c) 2015-present, Gabi Melman & spdlog contributors. 
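A usage sketch of the rotation scheme implemented above; the size, count, and file name are illustrative:

    #include <spdlog/sinks/rotating_file_sink.h>
    #include <spdlog/spdlog.h>

    int main() {
        // Up to 3 archives of ~5 MiB each. On overflow, rotate_() shifts
        // log.txt -> log.1.txt -> log.2.txt -> log.3.txt (oldest dropped);
        // calc_filename() puts the index before the extension.
        auto logger = spdlog::rotating_logger_mt("rot", "log.txt",
                1048576 * 5, 3, /*rotate_on_open=*/false);
        logger->info("rotating output");
        return 0;
    }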
-// Distributed under the MIT License (http://opensource.org/licenses/MIT) - -#pragma once - -#include -#include -#include -#include - -#include -#include -#include - -namespace spdlog { -namespace sinks { - -// -// Rotating file sink based on size -// -template -class rotating_file_sink final : public base_sink { -public: - rotating_file_sink(filename_t base_filename, std::size_t max_size, - std::size_t max_files, bool rotate_on_open = false, - const file_event_handlers &event_handlers = {}); - static filename_t calc_filename( - const filename_t &filename, std::size_t index); - filename_t filename(); - -protected: - void sink_it_(const details::log_msg &msg) override; - void flush_() override; - -private: - // Rotate files: - // log.txt -> log.1.txt - // log.1.txt -> log.2.txt - // log.2.txt -> log.3.txt - // log.3.txt -> delete - void rotate_(); - - // delete the target if exists, and rename the src file to target - // return true on success, false otherwise. - bool rename_file_( - const filename_t &src_filename, const filename_t &target_filename); - - filename_t base_filename_; - std::size_t max_size_; - std::size_t max_files_; - std::size_t current_size_; - details::file_helper file_helper_; -}; - -using rotating_file_sink_mt = rotating_file_sink; -using rotating_file_sink_st = rotating_file_sink; - -} // namespace sinks - -// -// factory functions -// - -template -inline std::shared_ptr rotating_logger_mt( - const std::string &logger_name, const filename_t &filename, - size_t max_file_size, size_t max_files, bool rotate_on_open = false, - const file_event_handlers &event_handlers = {}) { - return Factory::template create(logger_name, - filename, max_file_size, max_files, rotate_on_open, event_handlers); -} - -template -inline std::shared_ptr rotating_logger_st( - const std::string &logger_name, const filename_t &filename, - size_t max_file_size, size_t max_files, bool rotate_on_open = false, - const file_event_handlers &event_handlers = {}) { - return Factory::template create(logger_name, - filename, max_file_size, max_files, rotate_on_open, event_handlers); -} -} // namespace spdlog - -#ifdef SPDLOG_HEADER_ONLY -#include "rotating_file_sink-inl.h" -#endif diff --git a/src/common/spdlog/sinks/sink-inl.h b/src/common/spdlog/sinks/sink-inl.h deleted file mode 100755 index a1ef129fba9..00000000000 --- a/src/common/spdlog/sinks/sink-inl.h +++ /dev/null @@ -1,24 +0,0 @@ -// Copyright(c) 2015-present, Gabi Melman & spdlog contributors. -// Distributed under the MIT License (http://opensource.org/licenses/MIT) - -#pragma once - -#ifndef SPDLOG_HEADER_ONLY -#include -#endif - -#include - -SPDLOG_INLINE bool spdlog::sinks::sink::should_log( - spdlog::level::level_enum msg_level) const { - return msg_level >= level_.load(std::memory_order_relaxed); -} - -SPDLOG_INLINE void spdlog::sinks::sink::set_level(level::level_enum log_level) { - level_.store(log_level, std::memory_order_relaxed); -} - -SPDLOG_INLINE spdlog::level::level_enum spdlog::sinks::sink::level() const { - return static_cast( - level_.load(std::memory_order_relaxed)); -} diff --git a/src/common/spdlog/sinks/sink.h b/src/common/spdlog/sinks/sink.h deleted file mode 100755 index 18e0d7cffba..00000000000 --- a/src/common/spdlog/sinks/sink.h +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright(c) 2015-present, Gabi Melman & spdlog contributors. 
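The level stored on each sink (set_level()/should_log() above) filters independently of the owning logger's level, which is what makes split-severity setups work. A sketch; the names and thresholds are illustrative:

    #include <memory>

    #include <spdlog/sinks/basic_file_sink.h>
    #include <spdlog/sinks/stdout_color_sinks.h>
    #include <spdlog/spdlog.h>

    int main() {
        auto console = std::make_shared<spdlog::sinks::stdout_color_sink_mt>();
        console->set_level(spdlog::level::warn);  // console: warnings and up
        auto file = std::make_shared<spdlog::sinks::basic_file_sink_mt>("all.log");
        file->set_level(spdlog::level::trace);    // file: everything

        spdlog::logger logger("multi", {console, file});
        logger.set_level(spdlog::level::trace);   // logger-level gate runs first
        logger.warn("reaches console and file");
        logger.debug("file only");
        return 0;
    }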
-// Distributed under the MIT License (http://opensource.org/licenses/MIT) - -#pragma once - -#include -#include - -namespace spdlog { - -namespace sinks { -class SPDLOG_API sink { -public: - virtual ~sink() = default; - virtual void log(const details::log_msg &msg) = 0; - virtual void flush() = 0; - virtual void set_pattern(const std::string &pattern) = 0; - virtual void set_formatter( - std::unique_ptr sink_formatter) - = 0; - - void set_level(level::level_enum log_level); - level::level_enum level() const; - bool should_log(level::level_enum msg_level) const; - -protected: - // sink log level - default is all - level_t level_ {level::trace}; -}; - -} // namespace sinks -} // namespace spdlog - -#ifdef SPDLOG_HEADER_ONLY -#include "sink-inl.h" -#endif diff --git a/src/common/spdlog/spdlog-inl.h b/src/common/spdlog/spdlog-inl.h deleted file mode 100755 index b0641663f5c..00000000000 --- a/src/common/spdlog/spdlog-inl.h +++ /dev/null @@ -1,108 +0,0 @@ -// Copyright(c) 2015-present, Gabi Melman & spdlog contributors. -// Distributed under the MIT License (http://opensource.org/licenses/MIT) - -#pragma once - -#ifndef SPDLOG_HEADER_ONLY -#include -#endif - -#include -#include - -namespace spdlog { - -SPDLOG_INLINE void initialize_logger(std::shared_ptr logger) { - details::registry::instance().initialize_logger(std::move(logger)); -} - -SPDLOG_INLINE std::shared_ptr get(const std::string &name) { - return details::registry::instance().get(name); -} - -SPDLOG_INLINE void set_formatter(std::unique_ptr formatter) { - details::registry::instance().set_formatter(std::move(formatter)); -} - -SPDLOG_INLINE void set_pattern( - std::string pattern, pattern_time_type time_type) { - set_formatter(std::unique_ptr( - new pattern_formatter(std::move(pattern), time_type))); -} - -SPDLOG_INLINE void enable_backtrace(size_t n_messages) { - details::registry::instance().enable_backtrace(n_messages); -} - -SPDLOG_INLINE void disable_backtrace() { - details::registry::instance().disable_backtrace(); -} - -SPDLOG_INLINE void dump_backtrace() { - default_logger_raw()->dump_backtrace(); -} - -SPDLOG_INLINE level::level_enum get_level() { - return default_logger_raw()->level(); -} - -SPDLOG_INLINE bool should_log(level::level_enum log_level) { - return default_logger_raw()->should_log(log_level); -} - -SPDLOG_INLINE void set_level(level::level_enum log_level) { - details::registry::instance().set_level(log_level); -} - -SPDLOG_INLINE void flush_on(level::level_enum log_level) { - details::registry::instance().flush_on(log_level); -} - -SPDLOG_INLINE void set_error_handler(void (*handler)(const std::string &msg)) { - details::registry::instance().set_error_handler(handler); -} - -SPDLOG_INLINE void register_logger(std::shared_ptr logger) { - details::registry::instance().register_logger(std::move(logger)); -} - -SPDLOG_INLINE void apply_all( - const std::function)> &fun) { - details::registry::instance().apply_all(fun); -} - -SPDLOG_INLINE void drop(const std::string &name) { - details::registry::instance().drop(name); -} - -SPDLOG_INLINE void drop_all() { - details::registry::instance().drop_all(); -} - -SPDLOG_INLINE void shutdown() { - details::registry::instance().shutdown(); -} - -SPDLOG_INLINE void set_automatic_registration(bool automatic_registration) { - details::registry::instance().set_automatic_registration( - automatic_registration); -} - -SPDLOG_INLINE std::shared_ptr default_logger() { - return details::registry::instance().default_logger(); -} - -SPDLOG_INLINE spdlog::logger *default_logger_raw() { - 
return details::registry::instance().get_default_raw(); -} - -SPDLOG_INLINE void set_default_logger( - std::shared_ptr default_logger) { - details::registry::instance().set_default_logger(std::move(default_logger)); -} - -SPDLOG_INLINE void apply_logger_env_levels(std::shared_ptr logger) { - details::registry::instance().apply_logger_env_levels(std::move(logger)); -} - -} // namespace spdlog diff --git a/src/common/spdlog/spdlog.h b/src/common/spdlog/spdlog.h deleted file mode 100755 index ef7ac2d53ee..00000000000 --- a/src/common/spdlog/spdlog.h +++ /dev/null @@ -1,362 +0,0 @@ -// Copyright(c) 2015-present, Gabi Melman & spdlog contributors. -// Distributed under the MIT License (http://opensource.org/licenses/MIT) - -// spdlog main header file. -// see example.cpp for usage example - -#ifndef SPDLOG_H -#define SPDLOG_H - -#pragma once - -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -namespace spdlog { - -using default_factory = synchronous_factory; - -// Create and register a logger with a templated sink type -// The logger's level, formatter and flush level will be set according the -// global settings. -// -// Example: -// spdlog::create("logger_name", "dailylog_filename", 11, 59); -template -inline std::shared_ptr create( - std::string logger_name, SinkArgs &&...sink_args) { - return default_factory::create( - std::move(logger_name), std::forward(sink_args)...); -} - -// Initialize and register a logger, -// formatter and flush level will be set according the global settings. -// -// Useful for initializing manually created loggers with the global settings. -// -// Example: -// auto mylogger = std::make_shared("mylogger", ...); -// spdlog::initialize_logger(mylogger); -SPDLOG_API void initialize_logger(std::shared_ptr logger); - -// Return an existing logger or nullptr if a logger with such name doesn't -// exist. -// example: spdlog::get("my_logger")->info("hello {}", "world"); -SPDLOG_API std::shared_ptr get(const std::string &name); - -// Set global formatter. Each sink in each logger will get a clone of this object -SPDLOG_API void set_formatter(std::unique_ptr formatter); - -// Set global format string. -// example: spdlog::set_pattern("%Y-%m-%d %H:%M:%S.%e %l : %v"); -SPDLOG_API void set_pattern(std::string pattern, - pattern_time_type time_type = pattern_time_type::local); - -// enable global backtrace support -SPDLOG_API void enable_backtrace(size_t n_messages); - -// disable global backtrace support -SPDLOG_API void disable_backtrace(); - -// call dump backtrace on default logger -SPDLOG_API void dump_backtrace(); - -// Get global logging level -SPDLOG_API level::level_enum get_level(); - -// Set global logging level -SPDLOG_API void set_level(level::level_enum log_level); - -// Determine whether the default logger should log messages with a certain level -SPDLOG_API bool should_log(level::level_enum lvl); - -// Set global flush level -SPDLOG_API void flush_on(level::level_enum log_level); - -// Start/Restart a periodic flusher thread -// Warning: Use only if all your loggers are thread safe! 
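For context, the periodic flusher declared just below is driven from application code with a single call. A minimal sketch, assuming spdlog is consumed as <spdlog/spdlog.h> and that every registered logger is a thread-safe *_mt variant, as the warning above requires:

    #include <chrono>
    #include <spdlog/spdlog.h>

    int main() {
        // Flush all registered loggers from a background thread every
        // three seconds; safe only when all loggers are thread safe (_mt).
        spdlog::flush_every(std::chrono::seconds(3));
        spdlog::info("buffered messages now flush periodically");
    }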
-template -inline void flush_every(std::chrono::duration interval) { - details::registry::instance().flush_every(interval); -} - -// Set global error handler -SPDLOG_API void set_error_handler(void (*handler)(const std::string &msg)); - -// Register the given logger with the given name -SPDLOG_API void register_logger(std::shared_ptr logger); - -// Apply a user defined function on all registered loggers -// Example: -// spdlog::apply_all([&](std::shared_ptr l) {l->flush();}); -SPDLOG_API void apply_all( - const std::function)> &fun); - -// Drop the reference to the given logger -SPDLOG_API void drop(const std::string &name); - -// Drop all references from the registry -SPDLOG_API void drop_all(); - -// stop any running threads started by spdlog and clean registry loggers -SPDLOG_API void shutdown(); - -// Automatic registration of loggers when using spdlog::create() or spdlog::create_async -SPDLOG_API void set_automatic_registration(bool automatic_registration); - -// API for using default logger (stdout_color_mt), -// e.g: spdlog::info("Message {}", 1); -// -// The default logger object can be accessed using the spdlog::default_logger(): -// For example, to add another sink to it: -// spdlog::default_logger()->sinks().push_back(some_sink); -// -// The default logger can replaced using spdlog::set_default_logger(new_logger). -// For example, to replace it with a file logger. -// -// IMPORTANT: -// The default API is thread safe (for _mt loggers), but: -// set_default_logger() *should not* be used concurrently with the default API. -// e.g do not call set_default_logger() from one thread while calling spdlog::info() from another. - -SPDLOG_API std::shared_ptr default_logger(); - -SPDLOG_API spdlog::logger *default_logger_raw(); - -SPDLOG_API void set_default_logger( - std::shared_ptr default_logger); - -// Initialize logger level based on environment configs. -// -// Useful for applying SPDLOG_LEVEL to manually created loggers. 
-// -// Example: -// auto mylogger = std::make_shared("mylogger", ...); -// spdlog::apply_logger_env_levels(mylogger); -SPDLOG_API void apply_logger_env_levels(std::shared_ptr logger); - -template -inline void log(source_loc source, level::level_enum lvl, - format_string_t fmt, Args &&...args) { - default_logger_raw()->log(source, lvl, fmt, std::forward(args)...); -} - -template -inline void log( - level::level_enum lvl, format_string_t fmt, Args &&...args) { - default_logger_raw()->log( - source_loc {}, lvl, fmt, std::forward(args)...); -} - -template -inline void trace(format_string_t fmt, Args &&...args) { - default_logger_raw()->trace(fmt, std::forward(args)...); -} - -template -inline void debug(format_string_t fmt, Args &&...args) { - default_logger_raw()->debug(fmt, std::forward(args)...); -} - -template -inline void info(format_string_t fmt, Args &&...args) { - default_logger_raw()->info(fmt, std::forward(args)...); -} - -template -inline void warn(format_string_t fmt, Args &&...args) { - default_logger_raw()->warn(fmt, std::forward(args)...); -} - -template -inline void error(format_string_t fmt, Args &&...args) { - default_logger_raw()->error(fmt, std::forward(args)...); -} - -template -inline void critical(format_string_t fmt, Args &&...args) { - default_logger_raw()->critical(fmt, std::forward(args)...); -} - -template -inline void log(source_loc source, level::level_enum lvl, const T &msg) { - default_logger_raw()->log(source, lvl, msg); -} - -template -inline void log(level::level_enum lvl, const T &msg) { - default_logger_raw()->log(lvl, msg); -} - -#ifdef SPDLOG_WCHAR_TO_UTF8_SUPPORT -template -inline void log(source_loc source, level::level_enum lvl, - wformat_string_t fmt, Args &&...args) { - default_logger_raw()->log(source, lvl, fmt, std::forward(args)...); -} - -template -inline void log( - level::level_enum lvl, wformat_string_t fmt, Args &&...args) { - default_logger_raw()->log( - source_loc {}, lvl, fmt, std::forward(args)...); -} - -template -inline void trace(wformat_string_t fmt, Args &&...args) { - default_logger_raw()->trace(fmt, std::forward(args)...); -} - -template -inline void debug(wformat_string_t fmt, Args &&...args) { - default_logger_raw()->debug(fmt, std::forward(args)...); -} - -template -inline void info(wformat_string_t fmt, Args &&...args) { - default_logger_raw()->info(fmt, std::forward(args)...); -} - -template -inline void warn(wformat_string_t fmt, Args &&...args) { - default_logger_raw()->warn(fmt, std::forward(args)...); -} - -template -inline void error(wformat_string_t fmt, Args &&...args) { - default_logger_raw()->error(fmt, std::forward(args)...); -} - -template -inline void critical(wformat_string_t fmt, Args &&...args) { - default_logger_raw()->critical(fmt, std::forward(args)...); -} -#endif - -template -inline void trace(const T &msg) { - default_logger_raw()->trace(msg); -} - -template -inline void debug(const T &msg) { - default_logger_raw()->debug(msg); -} - -template -inline void info(const T &msg) { - default_logger_raw()->info(msg); -} - -template -inline void warn(const T &msg) { - default_logger_raw()->warn(msg); -} - -template -inline void error(const T &msg) { - default_logger_raw()->error(msg); -} - -template -inline void critical(const T &msg) { - default_logger_raw()->critical(msg); -} - -} // namespace spdlog - -// -// enable/disable log calls at compile time according to global level. 
-// -// define SPDLOG_ACTIVE_LEVEL to one of those (before including spdlog.h): -// SPDLOG_LEVEL_TRACE, -// SPDLOG_LEVEL_DEBUG, -// SPDLOG_LEVEL_INFO, -// SPDLOG_LEVEL_WARN, -// SPDLOG_LEVEL_ERROR, -// SPDLOG_LEVEL_CRITICAL, -// SPDLOG_LEVEL_OFF -// - -#ifndef SPDLOG_NO_SOURCE_LOC -#define SPDLOG_LOGGER_CALL(logger, level, ...) \ - (logger)->log(spdlog::source_loc {__FILE__, __LINE__, SPDLOG_FUNCTION}, \ - level, __VA_ARGS__) -#else -#define SPDLOG_LOGGER_CALL(logger, level, ...) \ - (logger)->log(spdlog::source_loc {}, level, __VA_ARGS__) -#endif - -#if SPDLOG_ACTIVE_LEVEL <= SPDLOG_LEVEL_TRACE -#define SPDLOG_LOGGER_TRACE(logger, ...) \ - SPDLOG_LOGGER_CALL(logger, spdlog::level::trace, __VA_ARGS__) -#define SPDLOG_TRACE(...) \ - SPDLOG_LOGGER_TRACE(spdlog::default_logger_raw(), __VA_ARGS__) -#else -#define SPDLOG_LOGGER_TRACE(logger, ...) (void)0 -#define SPDLOG_TRACE(...) (void)0 -#endif - -#if SPDLOG_ACTIVE_LEVEL <= SPDLOG_LEVEL_DEBUG -#define SPDLOG_LOGGER_DEBUG(logger, ...) \ - SPDLOG_LOGGER_CALL(logger, spdlog::level::debug, __VA_ARGS__) -#define SPDLOG_DEBUG(...) \ - SPDLOG_LOGGER_DEBUG(spdlog::default_logger_raw(), __VA_ARGS__) -#else -#define SPDLOG_LOGGER_DEBUG(logger, ...) (void)0 -#define SPDLOG_DEBUG(...) (void)0 -#endif - -#if SPDLOG_ACTIVE_LEVEL <= SPDLOG_LEVEL_INFO -#define SPDLOG_LOGGER_INFO(logger, ...) \ - SPDLOG_LOGGER_CALL(logger, spdlog::level::info, __VA_ARGS__) -#define SPDLOG_INFO(...) \ - SPDLOG_LOGGER_INFO(spdlog::default_logger_raw(), __VA_ARGS__) -#else -#define SPDLOG_LOGGER_INFO(logger, ...) (void)0 -#define SPDLOG_INFO(...) (void)0 -#endif - -#if SPDLOG_ACTIVE_LEVEL <= SPDLOG_LEVEL_WARN -#define SPDLOG_LOGGER_WARN(logger, ...) \ - SPDLOG_LOGGER_CALL(logger, spdlog::level::warn, __VA_ARGS__) -#define SPDLOG_WARN(...) \ - SPDLOG_LOGGER_WARN(spdlog::default_logger_raw(), __VA_ARGS__) -#else -#define SPDLOG_LOGGER_WARN(logger, ...) (void)0 -#define SPDLOG_WARN(...) (void)0 -#endif - -#if SPDLOG_ACTIVE_LEVEL <= SPDLOG_LEVEL_ERROR -#define SPDLOG_LOGGER_ERROR(logger, ...) \ - SPDLOG_LOGGER_CALL(logger, spdlog::level::err, __VA_ARGS__) -#define SPDLOG_ERROR(...) \ - SPDLOG_LOGGER_ERROR(spdlog::default_logger_raw(), __VA_ARGS__) -#else -#define SPDLOG_LOGGER_ERROR(logger, ...) (void)0 -#define SPDLOG_ERROR(...) (void)0 -#endif - -#if SPDLOG_ACTIVE_LEVEL <= SPDLOG_LEVEL_CRITICAL -#define SPDLOG_LOGGER_CRITICAL(logger, ...) \ - SPDLOG_LOGGER_CALL(logger, spdlog::level::critical, __VA_ARGS__) -#define SPDLOG_CRITICAL(...) \ - SPDLOG_LOGGER_CRITICAL(spdlog::default_logger_raw(), __VA_ARGS__) -#else -#define SPDLOG_LOGGER_CRITICAL(logger, ...) (void)0 -#define SPDLOG_CRITICAL(...) (void)0 -#endif - -#ifdef SPDLOG_HEADER_ONLY -#include "spdlog-inl.h" -#endif - -#endif // SPDLOG_H diff --git a/src/common/spdlog/version.h b/src/common/spdlog/version.h deleted file mode 100755 index d3d49f42e1d..00000000000 --- a/src/common/spdlog/version.h +++ /dev/null @@ -1,13 +0,0 @@ -// Copyright(c) 2015-present, Gabi Melman & spdlog contributors. 
-// Distributed under the MIT License (http://opensource.org/licenses/MIT) - -#pragma once - -#define SPDLOG_VER_MAJOR 1 -#define SPDLOG_VER_MINOR 14 -#define SPDLOG_VER_PATCH 1 - -#define SPDLOG_TO_VERSION(major, minor, patch) \ - (major * 10000 + minor * 100 + patch) -#define SPDLOG_VERSION \ - SPDLOG_TO_VERSION(SPDLOG_VER_MAJOR, SPDLOG_VER_MINOR, SPDLOG_VER_PATCH) diff --git a/src/common/stack_checker.hpp b/src/common/stack_checker.hpp index 013cdbcb58e..05cfa44bab1 100644 --- a/src/common/stack_checker.hpp +++ b/src/common/stack_checker.hpp @@ -1,5 +1,5 @@ /******************************************************************************* - * Copyright 2021-2023 Intel Corporation + * Copyright 2021-2024 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -204,7 +204,7 @@ struct stack_checker_t { size_t soft_stack_limit_in_bytes = get_soft_stack_limit() * get_page_size(); if (stack_consumption > soft_stack_limit_in_bytes) { - VERROR(common, stack_checker, + VWARN(common, stack_checker, "'%s' consumed %lu bytes of " "stack while the limit is %lu bytes", context_.c_str(), stack_consumption, diff --git a/src/common/stream.hpp b/src/common/stream.hpp index a29627cdb1f..e9fa73295e6 100644 --- a/src/common/stream.hpp +++ b/src/common/stream.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2016-2024 Intel Corporation +* Copyright 2016-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -29,7 +29,7 @@ struct dnnl_stream : public dnnl::impl::c_compatible { dnnl_stream(dnnl::impl::engine_t *engine, dnnl::impl::stream_impl_t *impl) : engine_(engine), impl_(impl) {} - virtual ~dnnl_stream() {} + virtual ~dnnl_stream() = default; /** returns stream's engine */ dnnl::impl::engine_t *engine() const { return engine_; } diff --git a/src/common/sum_pd.hpp b/src/common/sum_pd.hpp index 38663af5515..bc50b1d6fa3 100644 --- a/src/common/sum_pd.hpp +++ b/src/common/sum_pd.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2024 Intel Corporation +* Copyright 2019-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -40,6 +40,7 @@ namespace dnnl { namespace impl { +// NOLINTBEGIN(google-default-arguments) struct sum_pd_t : public primitive_desc_t { const sum_desc_t *desc() const { return &desc_; } const op_desc_t *op_desc() const override { @@ -115,14 +116,14 @@ struct sum_pd_t : public primitive_desc_t { init_desc(); } - sum_pd_t(const sum_pd_t &other) : primitive_desc_t(other) { - n_ = other.n_; - scales_ = other.scales_; - dst_md_ = other.dst_md_; - dst_acc_md_ = other.dst_acc_md_; - src_mds_ = other.src_mds_; - original_dst_md_ = other.original_dst_md_; - + sum_pd_t(const sum_pd_t &other) + : primitive_desc_t(other) + , n_(other.n_) + , scales_(other.scales_) + , dst_md_(other.dst_md_) + , dst_acc_md_(other.dst_acc_md_) + , src_mds_(other.src_mds_) + , original_dst_md_(other.original_dst_md_) { init_desc(); } sum_pd_t &operator=(const sum_pd_t &other) { @@ -195,6 +196,7 @@ struct sum_pd_t : public primitive_desc_t { desc_.src_mds.push_back(&md); } }; +// NOLINTEND(google-default-arguments) #define DECLARE_SUM_PD_t(impl_name, ...) 
\ static status_t create(sum_pd_t **sum_pd, dnnl::impl::engine_t *engine, \ diff --git a/src/common/tag_traits.hpp b/src/common/tag_traits.hpp index 487f4581c9e..ad34ec963fb 100644 --- a/src/common/tag_traits.hpp +++ b/src/common/tag_traits.hpp @@ -1,5 +1,6 @@ /******************************************************************************* -* Copyright 2018-2024 Intel Corporation +* Copyright 2018-2025 Intel Corporation +* Copyright 2024 FUJITSU LIMITED * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -25,6 +26,35 @@ namespace dnnl { namespace impl { +inline format_tag_t get_abx_tag(int ndims) { + switch (ndims) { + case 1: return format_tag::a; + case 2: return format_tag::ab; + case 3: return format_tag::abc; + case 4: return format_tag::abcd; + case 5: return format_tag::abcde; + case 6: return format_tag::abcdef; + case 7: return format_tag::abcdefg; + case 8: return format_tag::abcdefgh; + case 9: return format_tag::abcdefghi; + case 10: return format_tag::abcdefghij; + case 11: return format_tag::abcdefghijk; + case 12: return format_tag::abcdefghijkl; + + default: assert(!"unexpected ndims"); return format_tag::undef; + } +} + +inline format_tag_t get_axb_tag(int ndims) { + switch (ndims) { + case 2: return format_tag::ab; + case 3: return format_tag::acb; + case 4: return format_tag::acdb; + case 5: return format_tag::acdeb; + default: assert(!"unexpected ndims"); return format_tag::undef; + } +} + enum class block_dim_t { _, _A, @@ -89,6 +119,7 @@ enum class inner_blk_t { _8b16c, _8b24c, _8b32a, + _8a32b, _8b8c, _8c2b, _8c4b, @@ -112,6 +143,7 @@ enum class inner_blk_t { _16b4c, _16c2b, _16c4b, + _16e4c, _24a2b, _24a4b, _24b2a, @@ -120,6 +152,7 @@ enum class inner_blk_t { _24b4c, _24c2b, _24c4b, + _16d4c, _32d4c, _32e2c, _32e4c, @@ -209,7 +242,7 @@ constexpr int AB_or_BC_blk_off(int x0, int x1) { utils::one_of(f, ib::_4a4b, ib::_4b4a, ib::_4b4c, ib::_4c4b, ib::_8a2b, ib::_8a4b, ib::_8b2a, ib::_8b4a, ib::_8b2c, ib::_8c2b, ib::_8c4b, ib::_8b4c, ib::_8a8b, ib::_8b8a, - ib::_8b16a, ib::_8b24a, ib::_8b32a, ib::_8b8c, ib::_8c8b, + ib::_8b16a, ib::_8b24a, ib::_8b32a, ib::_8a32b, ib::_8b8c, ib::_8c8b, ib::_16a16b, ib::_16b64a, ib::_16b48a, ib::_16b32a, ib::_16b16a, ib::_16b16c, ib::_16c16b, ib::_32a32b, ib::_16a2b, ib::_16a4b, ib::_16b2a, ib::_16b4a, ib::_16b2c, @@ -241,9 +274,10 @@ constexpr int AB_or_BC_blk_off(int x0, int x1) { : (utils::one_of(f, ib::_2a24b, ib::_2b24c, ib::_8a24b, ib::_8b24c)) ? 24 * x0 + x1 : (f == ib::_4a4b || f == ib::_4b4c) ? 4 * x0 + x1 : (f == ib::_4b4a || f == ib::_4c4b) ? 4 * x1 + x0 - : (f == ib::_8a8b || f == ib::_8b8c) ? 8 * x0 + x1 + : (f == ib::_8a8b || f == ib::_8a32b || f == ib::_8b8c) ? 8 * x0 + x1 : (f == ib::_8b8a || f == ib::_8c8b) ? 8 * x1 + x0 : (utils::one_of(f, ib::_16a16b, ib::_16b16c, ib::_8a16b, ib::_8b16c)) ? 16 * x0 + x1 + : (f == ib::_16a16b || f == ib::_16a32b || f == ib::_16b16c) ? 16 * x0 + x1 : (f == ib::_16b64a) ? 64 * x1 + x0 : (f == ib::_16b48a) ? 48 * x1 + x0 : (f == ib::_8b32a || f == ib::_16b32a) ? 
32 * x1 + x0 @@ -293,12 +327,12 @@ constexpr int AB_or_BC_blk_off(int x0, int x1) { } template -struct inner_blk_traits { +struct inner_blk_traits_t { using ib = inner_blk_t; }; template -struct tag_traits { +struct tag_traits_t { // block_dim_t block_dims; // inner_blk_t inner_blks; // int ndims; @@ -306,7 +340,7 @@ struct tag_traits { #define DECL_TRAITS(_tag, _blk_fmt, _inner_blk, _ndims) \ template <> \ - struct tag_traits { \ + struct tag_traits_t { \ static constexpr block_dim_t block_dims = block_dim_t::_blk_fmt; \ static constexpr inner_blk_t inner_blks = inner_blk_t::_inner_blk; \ static constexpr int ndims = _ndims; \ @@ -668,6 +702,8 @@ DECL_TRAITS(ABcde16b48a2b, _AB, _16b48a2b, 5); DECL_TRAITS(ABcde16b64a2b, _AB, _16b64a2b, 5); DECL_TRAITS(ABcd8a16b2a, _AB, _8a16b2a, 4); DECL_TRAITS(ABcd8a8b, _AB, _8a8b, 4); +DECL_TRAITS(ABcd8a32b, _AB, _8a32b, 4); +DECL_TRAITS(ABcd16a32b, _AB, _16a32b, 4); DECL_TRAITS(aBcd8b, _B, _8b, 4); DECL_TRAITS(ABcd8b16a2b, _AB, _8b16a2b, 4); DECL_TRAITS(AcdB8b16a2b, _AB, _8b16a2b, 4); @@ -767,7 +803,9 @@ DECL_TRAITS(AcB16a4b, _AB, _16a4b, 3); DECL_TRAITS(Acb8a, _A, _8a, 3); DECL_TRAITS(AcB8a2b, _AB, _8a2b, 3); DECL_TRAITS(AcB8a4b, _AB, _8a4b, 3); +DECL_TRAITS(aCBd8b8c, _BC, _8b8c, 4); DECL_TRAITS(aCBd16b16c, _BC, _16b16c, 4); +DECL_TRAITS(aCBde8b8c, _BC, _8b8c, 5); DECL_TRAITS(aCBde16b16c, _BC, _16b16c, 5); DECL_TRAITS(Acdb16a, _A, _16a, 4); DECL_TRAITS(AcdB16a2b, _AB, _16a2b, 4); @@ -783,8 +821,11 @@ DECL_TRAITS(AcdeB8a2b, _AB, _8a2b, 5); DECL_TRAITS(AcdeB8a4b, _AB, _8a4b, 5); DECL_TRAITS(Acedb16a, _A, _16a, 5); DECL_TRAITS(Adcb16a, _A, _16a, 4); +DECL_TRAITS(BAc8a8b, _AB, _8a8b, 3); DECL_TRAITS(BAc16a16b, _AB, _16a16b, 3); +DECL_TRAITS(BAcd8a8b, _AB, _8a8b, 4); DECL_TRAITS(BAcd16a16b, _AB, _16a16b, 4); +DECL_TRAITS(BAcde8a8b, _AB, _8a8b, 5); DECL_TRAITS(BAcde16a16b, _AB, _16a16b, 5); DECL_TRAITS(ABcd32a32b, _AB, _32a32b, 4); DECL_TRAITS(BAcde16b16a, _AB, _16b16a, 5); @@ -794,7 +835,10 @@ DECL_TRAITS(aBCde4b8c8b4c, _BC, _4b8c8b4c, 5); DECL_TRAITS(aBCde2b8c8b2c, _BC, _2b8c8b2c, 5); DECL_TRAITS(aBdec32b, _B, _32b, 5); DECL_TRAITS(aCBdef16c16b, _BC, _16c16b, 6); +DECL_TRAITS(aCBdef8b8c, _BC, _8b8c, 6); DECL_TRAITS(aCBdef16b16c, _BC, _16b16c, 6); +DECL_TRAITS(Abcdef4a, _A, _4a, 6); +DECL_TRAITS(Abcdef8a, _A, _8a, 6); DECL_TRAITS(Abcdef16a, _A, _16a, 6); DECL_TRAITS(aCBd16c16b, _BC, _16c16b, 4); DECL_TRAITS(aCBde16c16b, _BC, _16c16b, 4); @@ -815,6 +859,7 @@ DECL_TRAITS(aBCde4c8b2c, _BC, _4c8b2c, 5); DECL_TRAITS(aBCdef4c8b2c, _BC, _4c8b2c, 6); DECL_TRAITS(abDc16d, _D, _16d, 4); DECL_TRAITS(abDc32d, _D, _32d, 4); +DECL_TRAITS(abDC16d4c, _CD, _16d4c, 4); DECL_TRAITS(abDC32d4c, _CD, _32d4c, 4); DECL_TRAITS(abCd32c, _C, _32c, 4); DECL_TRAITS(abCde32c, _C, _32c, 5); @@ -824,6 +869,7 @@ DECL_TRAITS(abCde4c, _C, _4c, 5); DECL_TRAITS(abCdef4c, _C, _4c, 6); DECL_TRAITS(abdEc16e, _E, _16e, 5); DECL_TRAITS(abdEc32e, _E, _32e, 5); +DECL_TRAITS(abdEC16e4c, _CE, _16e4c, 5); DECL_TRAITS(abdEC32e2c, _CE, _32e2c, 5); DECL_TRAITS(abdEC32e4c, _CE, _32e4c, 5); DECL_TRAITS(abdEC64e2c, _CE, _64e2c, 5); diff --git a/src/common/type_helpers.hpp b/src/common/type_helpers.hpp index d5ea2c38dda..ad00fadde16 100644 --- a/src/common/type_helpers.hpp +++ b/src/common/type_helpers.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2016-2024 Intel Corporation +* Copyright 2016-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the 
License. @@ -26,8 +26,11 @@ #include "bit_cast.hpp" #include "c_types_map.hpp" #include "dnnl_traits.hpp" +#include "gemm_types.hpp" #include "memory_desc.hpp" #include "nstl.hpp" +#include "opdesc.hpp" +#include "sdpa_types.hpp" #include "utils.hpp" namespace dnnl { @@ -50,8 +53,16 @@ status_t safe_ptr_assign(std::unique_ptr &lhs, derived_type *rhs) { return status::success; } +template +status_t safe_ptr_assign( + std::unique_ptr &lhs, derived_type *rhs) { + if (rhs == nullptr) return status::out_of_memory; + lhs.reset(rhs); + return status::success; +} + template -struct is_subset { +struct is_subset { // NOLINT(readability-identifier-naming) static constexpr bool value = false; }; template @@ -82,20 +93,24 @@ namespace types { inline size_t data_type_size(data_type_t data_type) { using namespace data_type; switch ((int)data_type) { - case e8m0: return sizeof(prec_traits::type); - case f8_e5m2: return sizeof(prec_traits::type); - case f8_e4m3: return sizeof(prec_traits::type); - case f16: return sizeof(prec_traits::type); - case bf16: return sizeof(prec_traits::type); + case f4_e3m0: return sizeof(prec_traits_t::type); + case f4_e2m1: return sizeof(prec_traits_t::type); + case e8m0: return sizeof(prec_traits_t::type); + case f8_e5m2: return sizeof(prec_traits_t::type); + case f8_e4m3: return sizeof(prec_traits_t::type); + case f16: return sizeof(prec_traits_t::type); + case bf16: return sizeof(prec_traits_t::type); case tf32: // the tf32 type is an f32 - case f32: return sizeof(prec_traits::type); - case f64: return sizeof(prec_traits::type); - case s32: return sizeof(prec_traits::type); - case s8: return sizeof(prec_traits::type); - case u8: return sizeof(prec_traits::type); - case s4: return sizeof(prec_traits::type); - case u4: return sizeof(prec_traits::type); - case boolean: return sizeof(prec_traits::type); + case f32: return sizeof(prec_traits_t::type); + case f64: return sizeof(prec_traits_t::type); + case s32: return sizeof(prec_traits_t::type); + case s8: return sizeof(prec_traits_t::type); + case u8: return sizeof(prec_traits_t::type); + case s4: return sizeof(prec_traits_t::type); + case u4: return sizeof(prec_traits_t::type); + case boolean: return sizeof(prec_traits_t::type); + case bin: return sizeof(prec_traits_t::type); + case nf4: return sizeof(prec_traits_t::type); case data_type::undef: default: assert(!"unknown data_type"); } @@ -105,6 +120,8 @@ inline size_t data_type_size(data_type_t data_type) { inline size_t elements_to_bytes(data_type_t data_type, size_t count) { using namespace data_type; switch ((int)data_type) { + case f4_e2m1: + case f4_e3m0: case s4: case u4: return (count + 1) >> 1; default: return data_type_size(data_type) * count; @@ -114,6 +131,8 @@ inline size_t elements_to_bytes(data_type_t data_type, size_t count) { inline size_t bytes_to_elements(data_type_t data_type, size_t bytes) { using namespace data_type; switch ((int)data_type) { + case f4_e2m1: + case f4_e3m0: case s4: case u4: return bytes * 2; default: return utils::div_up(bytes, data_type_size(data_type)); @@ -125,14 +144,18 @@ inline T min_value(data_type_t data_type) { using namespace data_type; #define CASE(x) \ case x: \ - return static_cast(nstl::numeric_limits::type>::min()) + return static_cast( \ + nstl::numeric_limits::type>::min()) switch (data_type) { + CASE(f4_e3m0); + CASE(f4_e2m1); CASE(e8m0); CASE(f8_e5m2); CASE(f8_e4m3); CASE(f16); CASE(bf16); CASE(f32); + CASE(f64); CASE(s32); CASE(s8); CASE(u8); @@ -150,19 +173,23 @@ inline T max_value(data_type_t data_type) { using 
namespace data_type; #define CASE(x) \ case x: \ - return static_cast(nstl::numeric_limits::type>::max()) + return static_cast( \ + nstl::numeric_limits::type>::max()) switch (data_type) { + CASE(f4_e3m0); + CASE(f4_e2m1); CASE(e8m0); CASE(f8_e5m2); CASE(f8_e4m3); CASE(f16); - CASE(f32); CASE(bf16); + CASE(f32); CASE(s32); CASE(s8); CASE(u8); CASE(s4); CASE(u4); + case f64: return nstl::numeric_limits::max(); case data_type::undef: default: assert(!"unknown data_type"); } @@ -177,8 +204,10 @@ inline float max_value(data_type_t data_type) { #define CASE(x) \ case x: \ return static_cast( \ - nstl::numeric_limits::type>::max()) + nstl::numeric_limits::type>::max()) switch (data_type) { + CASE(f4_e3m0); + CASE(f4_e2m1); CASE(e8m0); CASE(f8_e5m2); CASE(f8_e4m3); @@ -200,6 +229,7 @@ inline float max_value(data_type_t data_type) { // approach is saturating on some integer values before it should happen // in the reality. case s32: return 2147483520.f; + case f64: return nstl::numeric_limits::max(); case data_type::undef: default: assert(!"unknown data_type"); } @@ -213,8 +243,10 @@ inline T lowest_value(data_type_t data_type) { #define CASE(x) \ case x: \ return static_cast( \ - nstl::numeric_limits::type>::lowest()) + nstl::numeric_limits::type>::lowest()) switch (data_type) { + CASE(f4_e3m0); + CASE(f4_e2m1); CASE(e8m0); CASE(f8_e5m2); CASE(f8_e4m3); @@ -226,6 +258,7 @@ inline T lowest_value(data_type_t data_type) { CASE(u8); CASE(s4); CASE(u4); + case f64: return nstl::numeric_limits::lowest(); case data_type::undef: default: assert(!"unknown data_type"); } @@ -239,14 +272,17 @@ inline T digits(data_type_t data_type) { #define CASE(x) \ case x: \ return static_cast( \ - nstl::numeric_limits::type>::digits) + nstl::numeric_limits::type>::digits) switch (data_type) { + CASE(f4_e3m0); + CASE(f4_e2m1); CASE(e8m0); CASE(f8_e5m2); CASE(f8_e4m3); CASE(f16); CASE(bf16); CASE(f32); + CASE(f64); CASE(s32); CASE(s8); CASE(u8); @@ -271,31 +307,24 @@ inline format_kind_t format_tag_to_kind(format_tag_t tag) { return format_kind::undef; } -// Currently rnn_s8s8_compensation has common bits with rnn_u8s8_compensation -// and scale_adjust constants so we have to perform additional checks to -// separate these two cases -inline bool extra_flag_rnn_s8s8_compensation_is_set(uint64_t flags) { - return ((flags & memory_extra_flags::rnn_s8s8_compensation) - ^ memory_extra_flags::rnn_s8s8_compensation) - == 0; -} - inline bool memory_extra_desc_is_equal( const memory_extra_desc_t &lhs, const memory_extra_desc_t &rhs) { using namespace memory_extra_flags; - return true && lhs.flags == rhs.flags + return lhs.flags == rhs.flags && IMPLICATION(lhs.flags & compensation_conv_s8s8, lhs.compensation_mask == rhs.compensation_mask) - && IMPLICATION((lhs.flags & rnn_u8s8_compensation) - && !extra_flag_rnn_s8s8_compensation_is_set( - lhs.flags), + && IMPLICATION(lhs.flags & rnn_u8s8_compensation, lhs.compensation_mask == rhs.compensation_mask) - && IMPLICATION((lhs.flags & scale_adjust) - && !extra_flag_rnn_s8s8_compensation_is_set( - lhs.flags), + && IMPLICATION(lhs.flags & scale_adjust, lhs.scale_adjust == rhs.scale_adjust) && IMPLICATION(lhs.flags & compensation_conv_asymmetric_src, - lhs.asymm_compensation_mask == rhs.asymm_compensation_mask); + lhs.asymm_compensation_mask == rhs.asymm_compensation_mask) + && IMPLICATION(lhs.flags & compensation_gpu_conv_asymmetric_src, + (lhs.dst_size == rhs.dst_size) + && utils::array_cmp(lhs.idhw, rhs.idhw, 3) + && utils::array_cmp(lhs.odhw, rhs.odhw, 3) + && utils::array_cmp(lhs.pdhw, 
rhs.pdhw, 3) + && utils::array_cmp(lhs.ddhw, rhs.ddhw, 3)); } inline bool blocking_desc_is_equal(const memory_desc_t &lhs_md, @@ -327,12 +356,12 @@ inline bool blocking_desc_is_equal(const memory_desc_t &lhs_md, bool equal = lhs.inner_nblks == rhs.inner_nblks && array_cmp(lhs.inner_blks, rhs.inner_blks, lhs.inner_nblks) && array_cmp(lhs.inner_idxs, rhs.inner_idxs, lhs.inner_nblks); - if (ignore_strides) return equal; // Check the strides. - // Note: for dimensions of size `1` the stride doesn't really matter. + // Note: for dimensions of size `1` the stride doesn't really matter + if (ignore_strides) return equal; + for (int d = 0; d < lhs_md.ndims; ++d) { - if (lhs_md.dims[d] == 1 && lhs_md.padded_dims[d] == 1) continue; equal = equal && lhs.strides[d] == rhs.strides[d]; } @@ -346,6 +375,10 @@ inline bool wino_desc_is_equal(const wino_desc_t &lhs, const wino_desc_t &rhs) { && lhs.ic2_block == rhs.ic2_block && lhs.oc2_block == rhs.oc2_block && lhs.r == rhs.r; } +inline bool cublaslt_blocked_desc_is_equal(const cublaslt_blocked_desc_t &lhs, + const cublaslt_blocked_desc_t &rhs) { + return lhs.cublaslt_format == rhs.cublaslt_format && lhs.size == rhs.size; +} inline bool rnn_packed_desc_is_equal( const rnn_packed_desc_t &lhs, const rnn_packed_desc_t &rhs) { @@ -364,6 +397,7 @@ inline bool rnn_packed_desc_is_equal( inline bool sparse_desc_is_equal( const sparse_desc_t &lhs, const sparse_desc_t &rhs) { +#if 0 bool ok = lhs.encoding == rhs.encoding && lhs.nnz == rhs.nnz; if (!ok) return false; @@ -371,6 +405,8 @@ inline bool sparse_desc_is_equal( ok = ok && lhs.metadata_types[i] == rhs.metadata_types[i]; return ok; +#endif + return lhs.encoding == rhs.encoding; } inline memory_desc_t zero_md() { @@ -392,6 +428,8 @@ inline data_type_t default_accum_data_type( // true if (one_of(src_dt, s8, u8, u4, s4) && (dst_dt != f32 || strict)) return s32; + if (one_of(f4_e3m0, src_dt, dst_dt)) return f32; + if (one_of(f4_e2m1, src_dt, dst_dt)) return f32; if (one_of(f8_e5m2, src_dt, dst_dt)) return f32; if (one_of(f8_e4m3, src_dt, dst_dt)) return f32; if (one_of(f16, src_dt, dst_dt)) return f32; @@ -415,6 +453,7 @@ inline data_type_t default_accum_data_type(data_type_t src_dt, /* prop_kind doesn't matter */ if (everyone_is(f32, src_dt, wei_dt)) return f32; + if (one_of(src_dt, f32, bf16) && one_of(wei_dt, u8, s8, nf4, s4, u4, f4_e2m1)) return f32; if (everyone_is(f64, src_dt, wei_dt)) return f64; if (one_of(prop_kind, forward_training, forward_inference)) { @@ -433,6 +472,8 @@ inline data_type_t default_accum_data_type(data_type_t src_dt, return f32; } + if (one_of(f4_e3m0, src_dt, wei_dt, dst_dt)) return f32; + if (one_of(f4_e2m1, src_dt, wei_dt, dst_dt)) return f32; if (one_of(f8_e5m2, src_dt, wei_dt, dst_dt)) return f32; if (one_of(f8_e4m3, src_dt, wei_dt, dst_dt)) return f32; if (one_of(bf16, src_dt, wei_dt, dst_dt)) return f32; @@ -594,7 +635,7 @@ inline bool operator!=(const memory_desc_t &lhs, const memory_desc_t &rhs) { #define DEREF_AND_COMPARE_DESC_MEMBERS(m) *lhs.m == *rhs.m #define COMPARE_FLOAT_DESC_MEMBERS(m) utils::equal_with_nan(lhs.m, rhs.m) #define COMPARE_FLOAT_DESC_ARRAY_MEMBERS(m, s) \ - !std::memcmp(lhs.m, rhs.m, sizeof(float) * s) + !std::memcmp(lhs.m, rhs.m, sizeof(float) * (s)) // clang-format off inline bool operator==(const batch_normalization_desc_t &lhs, @@ -619,6 +660,12 @@ inline bool operator==(const binary_desc_t &lhs, const binary_desc_t &rhs) { && COMPARE_DESC_MEMBERS(src_desc[0]) && COMPARE_DESC_MEMBERS(src_desc[1]) && COMPARE_DESC_MEMBERS(dst_desc); + + // For ternary 
operators like select, the additional input for conditional
+    // select must also be compared.
+    if (utils::one_of(alg_kind::binary_select, lhs.alg_kind, rhs.alg_kind))
+        ret = ret && COMPARE_DESC_MEMBERS(src_desc[2]);
+
+    return ret;
 }

@@ -637,11 +684,12 @@ inline bool operator==(const concat_desc_t &lhs, const concat_desc_t &rhs) {
     return ret;
 }

-inline bool operator==(
-        const convolution_desc_t &lhs, const convolution_desc_t &rhs) {
+// This function can only be used to compare the opdescs in the primitive
+// cache. For comparing opdescs outside the primitive cache, please use the
+// regular comparison operator (==).
+inline bool compare_conv_opdesc(const convolution_desc_t &lhs, const convolution_desc_t &rhs) {
     bool ret = COMPARE_DESC_MEMBERS(primitive_kind)
             && COMPARE_DESC_MEMBERS(prop_kind)
-            && COMPARE_DESC_MEMBERS(alg_kind)
             && COMPARE_DESC_MEMBERS(src_desc)
             && COMPARE_DESC_MEMBERS(diff_src_desc)
             && COMPARE_DESC_MEMBERS(weights_desc)
@@ -656,9 +704,31 @@ inline bool operator==(
             && COMPARE_DESC_ARRAY_MEMBERS(padding[1], DNNL_MAX_NDIMS)
             && COMPARE_DESC_MEMBERS(accum_data_type)
             && COMPARE_DESC_MEMBERS(use_inversion);
+
+    // The `alg_kind` can be `auto` only if this function is called in the
+    // primitive descriptor cache scenario. In this case, we ignore `alg_kind`
+    // and rely on `pd_iterator_offset` to fetch the first suitable
+    // implementation.
+    //
+    // Background: when a convolution primitive descriptor is created for
+    // the algorithm `auto`, we overwrite the `alg_kind` field in `op_desc`
+    // when we store it in the primitive descriptor. Because of that, the
+    // `op_desc` stored in the primitive descriptor is different from the one
+    // the user passed to the oneDNN API, and the requested primitive
+    // descriptor cannot be found in the cache if we compare `alg_kind`.
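+    // Illustration (hypothetical values): a desc the user created with
+    // alg_kind = convolution_auto and the same desc stored after the library
+    // resolved it to, say, convolution_direct compare equal here, so the
+    // cache lookup still hits; the strict operator== below would treat them
+    // as different descriptors.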
+ if (!utils::one_of(alg_kind::convolution_auto, lhs.alg_kind, rhs.alg_kind)) + ret = ret && COMPARE_DESC_MEMBERS(alg_kind); + return ret; } +inline bool operator==( + const convolution_desc_t &lhs, const convolution_desc_t &rhs) { + if (!(COMPARE_DESC_MEMBERS(alg_kind))) return false; + return compare_conv_opdesc(lhs, rhs); +} + inline bool operator==(const eltwise_desc_t &lhs, const eltwise_desc_t &rhs) { bool ret = COMPARE_DESC_MEMBERS(primitive_kind) && COMPARE_DESC_MEMBERS(prop_kind) @@ -754,6 +824,8 @@ inline bool operator==(const matmul_desc_t &lhs, const matmul_desc_t &rhs) { && COMPARE_DESC_MEMBERS(weights_desc) && COMPARE_DESC_MEMBERS(bias_desc) && COMPARE_DESC_MEMBERS(dst_desc) + && COMPARE_DESC_MEMBERS(reduce_desc) + && COMPARE_DESC_MEMBERS(reduce_kind) && COMPARE_DESC_MEMBERS(accum_data_type); return ret; } @@ -908,10 +980,16 @@ inline bool operator==(const sdpa_desc_t &lhs, const sdpa_desc_t &rhs) { && COMPARE_DESC_MEMBERS(q_desc) && COMPARE_DESC_MEMBERS(k_desc) && COMPARE_DESC_MEMBERS(v_desc) + && COMPARE_DESC_MEMBERS(kq_scales) + && COMPARE_DESC_MEMBERS(kq_zero_points) + && COMPARE_DESC_MEMBERS(vs_scales) + && COMPARE_DESC_MEMBERS(vs_zero_points) && COMPARE_DESC_MEMBERS(dst_desc) && COMPARE_DESC_MEMBERS(attn_mask_desc) && COMPARE_DESC_MEMBERS(scale_dt) - && COMPARE_DESC_MEMBERS(invert_scale); + && COMPARE_DESC_MEMBERS(invert_scale) + && COMPARE_DESC_MEMBERS(kv_head_number) + && COMPARE_DESC_MEMBERS(mask_type); return ret; } @@ -923,7 +1001,8 @@ inline bool operator==(const sdpa_desc_t &lhs, const sdpa_desc_t &rhs) { #undef COMPARE_FLOAT_DESC_MEMBERS #undef COMPARE_FLOAT_DESC_ARRAY_MEMBERS -inline bool is_dense_format_kind(const std::vector mds) { +inline bool is_dense_format_kind( + const std::vector &mds) { #ifdef DNNL_EXPERIMENTAL_SPARSE for (const auto *md : mds) if (md->format_kind == format_kind::sparse) return false; @@ -1034,22 +1113,21 @@ inline status_t memory_desc_init_by_tag( const bool is_sparse = md.format_kind == format_kind::sparse; auto md_tmp = memory_desc_t(); - CHECK(memory_desc_init_by_tag( - md_tmp, md.ndims, md.dims, md.data_type, tag)); - - if (strides != nullptr && !memory_desc_strides_check(md_tmp, strides)) - return status::invalid_arguments; + status_t status = + memory_desc_init_by_tag(md_tmp, md.ndims, md.dims, md.data_type, tag); if (is_sparse) { - if (md.format_desc.sparse_desc.encoding != sparse_encoding::packed - || md.offset0 != 0) - return status::invalid_arguments; - md = cvt_blocked2sparse_packed(md_tmp, md.format_desc.sparse_desc.nnz); + const auto &bd = md_tmp.format_desc.blocking; + md.format_desc.sparse_desc.encoding = sparse_encoding::packed; + md.format_desc.sparse_desc.packed_desc = bd; } else { md = md_tmp; } - if (strides == nullptr) return status::success; + if (status != status::success || strides == nullptr) return status; + + if (!memory_desc_strides_check(md_tmp, strides)) + return status::invalid_arguments; for (int d = 0; d < md.ndims; ++d) { if (is_sparse) @@ -1057,7 +1135,6 @@ inline status_t memory_desc_init_by_tag( else md.format_desc.blocking.strides[d] = strides[d]; } - return status::success; } @@ -1135,9 +1212,19 @@ inline status_t memory_desc_init_by_md_and_dt(memory_desc_t &md, * Assumes a dense structure such as that returned by memory_desc_init_by_tag(). * Strides must match those returned by memory_desc_init_by_tag(), with one * exception: the strides of unit dimensions are ignored in order to align with - * memory descriptor equality comparisons and hashing. 
- */
-inline bool memory_desc_matches_tag(const memory_desc_t &md, format_tag_t tag) {
+ * memory descriptor equality comparisons and hashing.
+ * When strides are empty, the dense structure is assumed (e.g., the one that
+ * memory_desc_init_by_tag() returns).
+ * When strides are not empty, the standard strides check is overridden, and
+ * additional rules are applied:
+ * Strides might contain a `0` value, indicating the stride must match the
+ * one that memory_desc_init_by_tag() returns.
+ * Strides might contain `-1` values, which are ignored during the
+ * comparison. For instance, this can be used if a stride along minibatch
+ * doesn't matter. */
+inline bool memory_desc_matches_tag(const memory_desc_t &md, format_tag_t tag,
+        const dims_t strides = nullptr) {
     if (md.format_kind != format_kind::sparse) {
         if (md.format_kind != types::format_tag_to_kind(tag)) return false;
     }
@@ -1146,8 +1233,38 @@ inline bool memory_desc_matches_tag(const memory_desc_t &md, format_tag_t tag) {
     status_t status = memory_desc_init_by_tag(
             md_gold, md.ndims, md.dims, md.data_type, tag);
     if (status != status::success) return false;

+    const bool is_sparse_packed_desc = md.format_kind == format_kind::sparse
+            && md.format_desc.sparse_desc.encoding == sparse_encoding::packed;

-    return types::blocking_desc_is_equal(md, md_gold);
+    if (md.format_kind != format_kind::blocked && !is_sparse_packed_desc)
+        return false; // not implemented yet
+
+    const auto &blk = md.format_kind == format_kind::blocked
+            ? md.format_desc.blocking
+            : md.format_desc.sparse_desc.packed_desc;
+    const auto &blk_gold = md_gold.format_desc.blocking;
+
+    using utils::array_cmp;
+    bool same_blocks = true && blk.inner_nblks == blk_gold.inner_nblks
+            && array_cmp(blk.inner_blks, blk_gold.inner_blks, blk.inner_nblks)
+            && array_cmp(blk.inner_idxs, blk_gold.inner_idxs, blk.inner_nblks);
+
+    if (!same_blocks) return false;
+
+    if (strides == nullptr)
+        return array_cmp(blk.strides, blk_gold.strides, md.ndims);
+
+    for (int d = 0; d < md.ndims; ++d) {
+        dim_t stride = strides[d];
+        if (stride == -1) continue;
+        if (stride == 0) stride = blk_gold.strides[d];
+        if (blk.strides[d] != stride) return false;
+    }
+    return true;
 }

 /** returns matching tag (or undef if match is not found)
@@ -1183,8 +1300,8 @@ inline bool memory_desc_sanity_check(int ndims, const dims_t dims,
     if (ndims == 0) return true;

     bool ok = dims != nullptr && 0 < ndims && ndims <= DNNL_MAX_NDIMS
-            && utils::one_of(data_type, f8_e5m2, f8_e4m3, f16, bf16, f32, f64,
-                    s32, s8, u8, s4, u4);
+            && utils::one_of(data_type, f4_e3m0, f4_e2m1, e8m0, f8_e5m2,
+                    f8_e4m3, f16, bf16, f32, f64, s32, s8, u8, s4, u4, bin, nf4);
     if (!ok) return false;

     bool has_runtime_dims = false;
@@ -1206,38 +1323,6 @@ inline bool memory_desc_sanity_check(const memory_desc_t &md) {
             md.ndims, md.dims, md.data_type, format_kind::undef);
 }

-inline void copy_c_op_desc(op_desc_t *dst, const op_desc_t *src) {
-#define CASE_OP_DESC(pkind) \
-    case primitive_kind::pkind: dst->pkind = src->pkind; break;
-
-    switch ((int)src->kind) {
-        CASE_OP_DESC(batch_normalization);
-        CASE_OP_DESC(binary);
-        CASE_OP_DESC(convolution);
-        CASE_OP_DESC(deconvolution);
-        CASE_OP_DESC(eltwise);
-        CASE_OP_DESC(gemm);
-        CASE_OP_DESC(group_normalization);
-        CASE_OP_DESC(inner_product);
-        CASE_OP_DESC(layer_normalization);
-        CASE_OP_DESC(lrn);
-
CASE_OP_DESC(matmul); - CASE_OP_DESC(pooling); - CASE_OP_DESC(prelu); - CASE_OP_DESC(reduction); - CASE_OP_DESC(resampling); - CASE_OP_DESC(rnn); - CASE_OP_DESC(sdpa); - CASE_OP_DESC(shuffle); - CASE_OP_DESC(softmax); - - // Internal descs - CASE_OP_DESC(zero_pad); - default: assert(!"unknown C primitive kind"); - } -#undef CASE_OP_DESC -} - } // namespace impl } // namespace dnnl diff --git a/src/common/utils.cpp b/src/common/utils.cpp index a11d9104bfd..9ccb8531f96 100644 --- a/src/common/utils.cpp +++ b/src/common/utils.cpp @@ -19,12 +19,12 @@ #include #endif -#if defined __unix__ || defined __APPLE__ || defined __FreeBSD__ \ - || defined __Fuchsia__ +#if defined(__unix__) || defined(__APPLE__) || defined(__FreeBSD__) \ + || defined(__Fuchsia__) #include #endif -#ifdef __unix__ +#if defined(__unix__) || defined(__APPLE__) #include #include #endif @@ -41,6 +41,7 @@ #include "memory_debug.hpp" #include "utils.hpp" +#include "verbose.hpp" #if DNNL_CPU_RUNTIME != DNNL_RUNTIME_NONE #include "cpu/platform.hpp" @@ -124,7 +125,49 @@ std::string getenv_string_user(const char *name) { return value; } +status_t check_for_symlinks(const char *filename, bool *res) { +#ifdef _WIN32 + DWORD attr = GetFileAttributes(filename); + + // checking for ERROR_FILE_NOT_FOUND allows the application to open + // new files without raising an exception + if (attr == INVALID_FILE_ATTRIBUTES) + return (GetLastError() == ERROR_FILE_NOT_FOUND) + ? status::success + : status::invalid_arguments; + *res = (attr & FILE_ATTRIBUTE_REPARSE_POINT); + return status::success; +#else + struct stat finfo; + // checking for ENOENT allows the application to open new files without + // raising an exception + if (lstat(filename, &finfo) != 0) + return (errno == ENOENT) ? status::success : status::invalid_arguments; + *res = (finfo.st_mode & S_IFMT) == S_IFLNK; + return status::success; +#endif +} + FILE *fopen(const char *filename, const char *mode) { + bool is_symlink = false; + status_t fattr_status = check_for_symlinks(filename, &is_symlink); + + // For any return status other than status::success, the file IO operation + // is abandoned implying a major issue in retrieving the file + if (fattr_status != status::success) { + VERROR(common, common, "error reading file attributes for %s", + filename); + return nullptr; + } + + // The symlink flag is updated and checked only after the file attributes are + // successfully read, avoiding the use of an uninitialized variable. + if (is_symlink) { + VERROR(common, common, + "cannot open %s - specified file is a symbolic link", filename); + return nullptr; + } + #ifdef _WIN32 FILE *fp = NULL; return ::fopen_s(&fp, filename, mode) ? NULL : fp; @@ -187,7 +230,7 @@ bool get_jit_dump() { return jit_dump.get(); } -#if defined(DNNL_AARCH64) && (DNNL_AARCH64 == 1) +#if defined(DNNL_AARCH64) && (DNNL_AARCH64 == 1) || defined(DNNL_ARM) && (DNNL_ARM == 1) static setting_t jit_profiling_flags {DNNL_JIT_PROFILE_LINUX_PERFMAP}; #else static setting_t jit_profiling_flags {DNNL_JIT_PROFILE_VTUNE}; diff --git a/src/common/utils.hpp b/src/common/utils.hpp index cb9d681dccc..84957503436 100644 --- a/src/common/utils.hpp +++ b/src/common/utils.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2016-2024 Intel Corporation +* Copyright 2016-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
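On POSIX systems, the symlink refusal added above reduces to an lstat() mode test. A standalone sketch of that logic, with the helper name is_symlink invented for illustration (the library's actual entry point is check_for_symlinks()):

    #include <cstdio>
    #include <sys/stat.h>

    // Report whether `path` names a symbolic link. A missing file counts as
    // "not a symlink" so that new files can still be created, mirroring the
    // ENOENT handling in dnnl::impl::check_for_symlinks().
    static bool is_symlink(const char *path) {
        struct stat finfo;
        if (lstat(path, &finfo) != 0) return false;
        return (finfo.st_mode & S_IFMT) == S_IFLNK;
    }

    int main() {
        std::printf("/tmp: %s\n", is_symlink("/tmp") ? "symlink" : "regular");
    }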
@@ -52,21 +52,25 @@ namespace impl { #define DNNL_SHORT_CIRCUIT_SELF_ASSIGN(other) \ do { \ - if (this == &other) return *this; \ + if (this == &(other)) return *this; \ } while (0) #define DNNL_SHORT_CIRCUIT_SELF_COMPARISON(other) \ do { \ - if (this == &other) return true; \ + if (this == &(other)) return true; \ } while (0) #define DNNL_DISALLOW_COPY_AND_ASSIGN(T) \ T(const T &) = delete; \ - T &operator=(const T &) = delete; + void operator=(const T &) = delete; // Sanity check for 64 bits -static_assert(sizeof(void *) == 8, "oneDNN supports 64-bit architectures only"); +// static_assert(sizeof(void *) == 8, "oneDNN supports 64-bit architectures only"); +// Note: if `f` has any explicit templated arguments, e.g., func, then +// compiler returns `error: macro "CHECK" passed 2 arguments, but takes just 1`. +// The solution is to use an alias, e.g. `using func_alias = func;` and +// use `func_alias` in CHECK, then it compiles. #define CHECK(f) \ do { \ dnnl::impl::status_t _status_ = f; \ @@ -88,6 +92,15 @@ static_assert(sizeof(void *) == 8, "oneDNN supports 64-bit architectures only"); #define IMPLICATION(cause, effect) (!(cause) || !!(effect)) +#if defined(_MSC_VER) || defined(__INTEL_COMPILER) \ + || defined(__INTEL_LLVM_COMPILER) +#define FORCE_INLINE __forceinline +#elif defined(__clang__) || defined(__GNUC__) +#define FORCE_INLINE inline __attribute__((always_inline)) +#else +#define FORCE_INLINE inline +#endif + namespace utils { /* a bunch of std:: analogues to be compliant with any msvs version @@ -100,41 +113,50 @@ namespace utils { /* SFINAE helper -- analogue to std::enable_if */ template -struct enable_if {}; +struct enable_if {}; // NOLINT(readability-identifier-naming) + template struct enable_if { - typedef T type; + using type = T; }; +// Replacement implementation of std::enable_if_t from C++14, included here for +// interoperability with C++11 +template +using enable_if_t = typename enable_if::type; + +template +using is_vector = std::is_same>; + /* analogue std::conditional */ template -struct conditional {}; +struct conditional {}; // NOLINT(readability-identifier-naming) template struct conditional { - typedef T type; + using type = T; }; template struct conditional { - typedef F type; + using type = F; }; template -struct conditional3 {}; +struct conditional3 {}; // NOLINT(readability-identifier-naming) template struct conditional3 { - typedef T type; + using type = T; }; template struct conditional3 { - typedef FT type; + using type = FT; }; template struct conditional3 { - typedef FF type; + using type = FF; }; template -struct conditional_v {}; +struct conditional_v {}; // NOLINT(readability-identifier-naming) template struct conditional_v { static constexpr U value = t; @@ -145,16 +167,16 @@ struct conditional_v { }; template -struct remove_reference { - typedef T type; +struct remove_reference { // NOLINT(readability-identifier-naming) + using type = T; }; template struct remove_reference { - typedef T type; + using type = T; }; template struct remove_reference { - typedef T type; + using type = T; }; template @@ -177,6 +199,7 @@ std::unique_ptr make_unique(Args &&...args) { return std::unique_ptr(new T(std::forward(args)...)); } +// NOLINTBEGIN(performance-unnecessary-value-param) template constexpr bool everyone_is(T val, P item) { return val == item; @@ -185,7 +208,9 @@ template constexpr bool everyone_is(T val, P item, Args... 
item_others) { return val == item && everyone_is(val, item_others...); } +// NOLINTEND(performance-unnecessary-value-param) +// NOLINTBEGIN(performance-unnecessary-value-param) template constexpr bool one_of(T val, P item) { return val == item; @@ -194,6 +219,7 @@ template constexpr bool one_of(T val, P item, Args... item_others) { return val == item || one_of(val, item_others...); } +// NOLINTEND(performance-unnecessary-value-param) template constexpr P map(T pat, P def) { @@ -209,11 +235,30 @@ constexpr bool any_null(Args... ptrs) { return one_of(nullptr, ptrs...); } +// For some unknown reason, GCC 11.x and beyond can't compile specific places +// of the library that involve this routine. It's connected to the fact that +// this function is inline and defined in a header. +#if defined(__GNUC__) && __GNUC__ > 8 && !defined(__clang__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wrestrict" +// /usr/include/bits/string_fortified.h:29:33: warning: ‘void* __builtin_memcpy( +// void*, const void*, long unsigned int)’ accessing 18446744056529682432 or +// more bytes at offsets 320 and 0 overlaps 9223372002495037441 bytes at +// offset -9223372019674906625 [-Wrestrict] +#pragma GCC diagnostic ignored "-Wstringop-overflow" +// warning: ‘void* __builtin_memcpy(void*, const void*, long unsigned int)’ +// specified bound between 18446744056529682432 and 18446744073709551608 +// exceeds maximum object size 9223372036854775807 [-Wstringop-overflow=] +#endif template inline void array_copy(T *dst, const T *src, size_t size) { for (size_t i = 0; i < size; ++i) dst[i] = src[i]; } +#if defined(__GNUC__) && __GNUC__ > 8 && !defined(__clang__) +#pragma GCC diagnostic pop +#endif + template inline bool array_cmp(const T *a1, const T *a2, size_t size) { for (size_t i = 0; i < size; ++i) @@ -226,9 +271,15 @@ inline void array_set(T *arr, const U &val, size_t size) { arr[i] = static_cast(val); } +inline bool array_cmp_weak(const dnnl_dim_t *a1, const dnnl_dim_t *a2, size_t size) { + for (size_t i = 0; i < size; ++i) + if (a1[i] != a2[i] && a1[i] != DNNL_RUNTIME_DIM_VAL && a2[i] != DNNL_RUNTIME_DIM_VAL) return false; + return true; +} + namespace product_impl { template -struct int2type {}; +struct int2type {}; // NOLINT(readability-identifier-naming) template constexpr int product_impl(const T *arr, int2type<0>) { @@ -455,11 +506,10 @@ T pick_by_prop_kind(prop_kind_t prop_kind, const T &val_fwd, const T &val_bwd_d, } template -struct array_offset_calculator { +struct array_offset_calculator { // NOLINT(readability-identifier-naming) template - array_offset_calculator(Telem *base, Targs... Fargs) : _dims {Fargs...} { - _base_ptr = base; - } + array_offset_calculator(Telem *base, Targs... Fargs) + : _base_ptr(base), _dims {Fargs...} {} template array_offset_calculator(std::nullptr_t, Targs... Fargs) = delete; @@ -515,7 +565,14 @@ const char *format_cvt_impl(T &&t) { template std::string format_impl(const char *fmt, Args... 
args) { + // volatile here is a workaround for GCC 8 format-truncation warning e.g.: + // ‘%d’ directive output truncated writing 1 byte into a region of size 0 + // triggered by overaggressive optmization in '-O3'; fixed in GCC 9+ +#if defined(__GNUC__) && __GNUC__ == 8 && !defined(__clang__) + volatile size_t sz = snprintf(nullptr, 0, fmt, args...); +#else size_t sz = snprintf(nullptr, 0, fmt, args...); +#endif std::string buf(sz + 1, '\0'); snprintf(&buf[0], sz + 1, fmt, args...); buf.resize(sz); @@ -528,15 +585,15 @@ std::string format(const char *fmt, Args &&...args) { } inline bool need_src_or_dst_check( - bool is_fwd, int o, int i, int k, int p, int s, int d) { + bool is_fwd, dim_t o, dim_t i, dim_t k, dim_t p, dim_t s, dim_t d) { if (is_fwd) { - int i_min = -p; - int i_max = (o - 1) * s - p + (k - 1) * (1 + d); + dim_t i_min = -p; + dim_t i_max = (o - 1) * s - p + (k - 1) * (1 + d); return (i_min < 0) || (i_max >= i); } // Backward. - int os_min = p - (k - 1) * (1 + d); - int os_max = (i - 1) + p; + dim_t os_min = p - (k - 1) * (1 + d); + dim_t os_max = (i - 1) + p; return (os_min < 0) || (os_max >= o * s); } @@ -568,15 +625,23 @@ inline int get_dims_mask(const dims_t dims1, const dims_t dims2, int ndims, return mask; }; -inline void copy_dims_with_mask( - dims_t ddims, const dims_t sdims, int ndims, int mask) { +// The function can be used to get dimensions for memory descriptors or +// dimensions for logical offset. First ones are happy to have ones when mask +// is not applied. This allows to initialize them with existing functions using +// tags/strides. Latter ones are not nappy with ones and must have zeros as +// logical offsets starts with 0. `fill_with_one` flag regulates the behavior +// between them. +inline void copy_dims_with_mask(dims_t ddims, const dims_t sdims, int ndims, + int mask, bool fill_with_one = false) { for (int d = 0; d < ndims; ++d) { - ddims[d] = (mask & (1 << d)) ? sdims[d] : 0; + ddims[d] = (mask & (1 << d)) ? sdims[d] + : static_cast(fill_with_one); } } -inline void apply_mask_on_dims(dims_t dims, int ndims, int mask) { - copy_dims_with_mask(dims, dims, ndims, mask); +inline void apply_mask_on_dims( + dims_t dims, int ndims, int mask, bool fill_with_one = false) { + copy_dims_with_mask(dims, dims, ndims, mask, fill_with_one); } inline void dim_iterator(const dims_t dims, dims_t indices, int ndims) { @@ -641,6 +706,9 @@ std::string getenv_string_user(const char *name); bool get_jit_dump(); unsigned get_jit_profiling_flags(); std::string get_jit_profiling_jitdumpdir(); +// Checks if the filepath is a valid path and not a symlink to ensure +// the application only processes secure files. +status_t check_for_symlinks(const char *filename, bool *res); FILE *fopen(const char *filename, const char *mode); int getpagesize(); @@ -680,7 +748,7 @@ struct setting_t { constexpr setting_t(const T init) : value_ {init}, initialized_ {false} {} bool initialized() { return initialized_; } T get() { return value_; } - void set(T new_value) { + void set(const T &new_value) { value_ = new_value; initialized_ = true; } @@ -691,11 +759,17 @@ struct setting_t { // Copyright 2005-2014 Daniel James. // Distributed under the Boost Software License, Version 1.0. 
(See accompanying // file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt) -template +template ::value , int>::type = 0> static size_t hash_combine(size_t seed, const T &v) { return seed ^= std::hash {}(v) + 0x9e3779b9 + (seed << 6) + (seed >> 2); } +template ::value , int>::type = 0> +static size_t hash_combine(size_t seed, const T &v) { + using underlying_t = typename std::underlying_type::type; + return hash_combine(seed, static_cast(v)); +} + inline int float2int(float x) { return utils::bit_cast(x); } @@ -749,7 +823,7 @@ struct set_once_before_first_get_setting_t { inline bool is_native_runtime(runtime_kind_t kind) { return utils::one_of(kind, runtime_kind::seq, runtime_kind::omp, - runtime_kind::tbb, runtime_kind::threadpool); + runtime_kind::tbb, runtime_kind::tbb_auto, runtime_kind::threadpool); } // Convenience wrapper to choose at compile-time between std::unique_ptr's @@ -778,6 +852,79 @@ template using maybe_unique_ptr = std::unique_ptr; #endif // DNNL_MAYBE_UNIQUE_PTR_IS_UNIQUE +// Common abstraction to manipulate nibbles in memory as pairs +struct nibble2_t { + + // constructs a nibble pair from a pair of uint8_t values + nibble2_t(uint8_t low_, uint8_t high_) : low(low_), high(high_) {} + + // constructs a nibble pairs from an uin8_t, taking its low and high part + nibble2_t(uint8_t pack_) : low(pack_ & 0xf), high((pack_ >> 4) & 0xf) {} + + // sets low (idx=0) or high (idx=1) nibble. + inline void set(uint8_t val, int idx) { + switch (idx) { + case 0: low = val; return; + case 1: high = val; return; + default: assert(!"Out of range index"); return; + } + } + + // returns low (idx = 0) or high (idx = 1) nibble in a uint8_t + inline uint8_t get(int idx) const { + switch (idx) { + case 0: return low; + case 1: return high; + default: assert(!"out of range index"); return 0; + } + } + + // returns pair of nibbles as uint8t + inline uint8_t get() const { return static_cast(high << 4 | low); } + +private: + uint8_t low : 4; + uint8_t high : 4; +}; +static_assert(sizeof(nibble2_t) == 1, "nibble2_t must be 1 byte"); + +/// Iterates through a binary integer +/// usage: +/// +/// for(int idx : mask_iterator(13)) { // 13 == 1101 +/// printf("%d\t", idx); +/// } +/// output: 0 2 3 +class mask_iterator { // NOLINT(readability-identifier-naming) + int mask_; + int index_; + +public: + using iterator_category = std::input_iterator_tag; + using difference_type = int; + using value_type = int; + using pointer = value_type *; + using reference = value_type &; + mask_iterator() : mask_(0), index_(0) {} + mask_iterator(int mask) : mask_(mask), index_(0) { + if ((mask_ & 0x1) == 0) { ++(*this); } + } + mask_iterator &begin() { return *this; } + mask_iterator end() const { return 0; } + value_type operator*() const { return index_; } + mask_iterator &operator++() { + do { + index_++; + mask_ >>= 1; + } while ((mask_ & 0x1) == 0 && mask_ != 0); + if (mask_ == 0) { index_ = 0; } + return *this; + } + bool operator!=(const mask_iterator &other) const { + return mask_ != other.mask_ || index_ != other.index_; + } +}; + } // namespace impl } // namespace dnnl diff --git a/src/common/verbose.cpp b/src/common/verbose.cpp index 3ea9067130f..1d0d18c8b4a 100644 --- a/src/common/verbose.cpp +++ b/src/common/verbose.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2018-2024 Intel Corporation +* Copyright 2018-2025 Intel Corporation * Copyright 2023 Arm Ltd. 
and affiliates * * Licensed under the Apache License, Version 2.0 (the "License");
@@ -53,6 +53,7 @@ #include "reorder_pd.hpp" #include "resampling_pd.hpp" #include "rnn_pd.hpp" +#include "sdpa_pd.hpp" #include "shuffle_pd.hpp" #include "softmax_pd.hpp" #include "sum_pd.hpp"
@@ -84,7 +85,15 @@ static constexpr char verbose_version[] = "v1"; static setting_t<uint32_t> verbose {0}; -void print_header(const filter_status_t &filter_status) noexcept { +// Component filters restrict verbose output to the components that match a +// user-provided filter. The filter status is recorded at verbose +// initialization so that the matched components can be queried later, during +// verbose printing. +filter_status_t &filter_status() { + static filter_status_t filter_status; + return filter_status; +} + +void print_header() noexcept { static std::atomic_flag version_printed = ATOMIC_FLAG_INIT; if (!version_printed.test_and_set()) { verbose_printf("info,oneDNN v%d.%d.%d (commit %s)\n",
@@ -119,6 +128,8 @@ void print_header(const filter_status_t &filter_status) noexcept { verbose_printf("info,use batch_normalization stats one pass is %s\n", experimental::use_bnorm_stats_one_pass() ? "enabled" : "disabled"); + verbose_printf("info,GPU convolution v2 is %s\n", + experimental::use_gpu_conv_v2() ? "enabled" : "disabled"); #endif #ifdef DNNL_EXPERIMENTAL_SPARSE
@@ -147,16 +158,16 @@ void print_header(const filter_status_t &filter_status) noexcept { "mode,implementation,backend,exec_time\n", get_verbose_timestamp() ? "timestamp," : ""); #endif - if (filter_status.status == filter_status_t::flags::valid) + if (filter_status().status == filter_status_t::flags::valid) verbose_printf( "common,info,filter format is enabled, hit components: " "%s\n", - filter_status.components.c_str()); - else if (filter_status.status == filter_status_t::flags::invalid) + filter_status().components.c_str()); + else if (filter_status().status == filter_status_t::flags::invalid) verbose_printf( "common,error,filter format is ill-formed and is not " "applied, error: %s\n", - filter_status.err_msg.c_str()); + filter_status().err_msg.c_str()); } }
@@ -165,11 +176,9 @@ uint32_t get_verbose(verbose_t::flag_kind verbosity_kind, component_t::flag_kind filter_kind) noexcept { #if defined(DISABLE_VERBOSE) return verbose_t::none; -#else +#endif // we print all verbose by default static int flags = component_t::all; - // record filter parsing result to instruct verbose printing - static filter_status_t filter_status; if (!verbose.initialized()) { // Assumes that all threads see the same environment
@@ -178,9 +187,8 @@ uint32_t get_verbose(verbose_t::flag_kind verbosity_kind, // Legacy: we accept values 0,1,2 // 0 and none erase previously set flags, including error if (s == "0" || s == "none") k = verbose_t::none; - if (s == "1") k |= verbose_t::exec_profile; - if (s == "2") - k |= verbose_t::exec_profile | verbose_t::create_profile; + if (s == "1") k |= verbose_t::level1; + if (s == "2") k |= verbose_t::level2; if (s == "all" || s == "-1") k |= verbose_t::all; if (s == "error") k |= verbose_t::error; if (s == "check")
@@ -192,62 +200,62 @@ uint32_t get_verbose(verbose_t::flag_kind verbosity_kind, if (s == "profile_exec") k |= verbose_t::exec_profile; // Enable profiling to external libraries if (s == "profile_externals") k |= verbose_t::profile_externals; + if (s == "warn") k |= verbose_t::warn; // we extract debug info debuginfo=XX. ignore if debuginfo is invalid.
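// The token scan above walks a comma-separated ONEDNN_VERBOSE spec, treating
// `filter=<regex>` entries specially. A minimal self-contained sketch of that
// tokenization, with a hypothetical helper name (parse_verbose_spec is not
// part of the library's API):
#include <iostream>
#include <string>
#include <vector>

static void parse_verbose_spec(const std::string &spec,
        std::vector<std::string> &flags, std::string &filter) {
    size_t pos_st = 0;
    while (pos_st <= spec.size()) {
        const size_t pos_en = spec.find(',', pos_st);
        const std::string tok = spec.substr(pos_st,
                pos_en == std::string::npos ? std::string::npos
                                            : pos_en - pos_st);
        if (tok.rfind("filter=", 0) == 0)
            filter = tok.substr(7); // keep only the regex part
        else if (!tok.empty())
            flags.push_back(tok); // e.g. "profile_exec", "warn", "check"
        if (pos_en == std::string::npos) break;
        pos_st = pos_en + 1;
    }
}

int main() {
    std::vector<std::string> flags;
    std::string filter;
    parse_verbose_spec("profile_exec,warn,filter=conv|matmul", flags, filter);
    for (const auto &f : flags) std::cout << "flag: " << f << '\n';
    std::cout << "filter: " << filter << '\n'; // prints "conv|matmul"
    return 0;
}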
if (s.rfind("debuginfo=", 0) == 0) k |= verbose_t::make_debuginfo( std::strtol(s.c_str() + 10, nullptr, 10)); }; - auto update_filter = [&](const std::string &s, - filter_status_t &filter_status) -> int { + auto update_filter = [&](const std::string &s) -> int { int k = component_t::none; try { std::regex regexp = std::regex(s); -#define REGEX_SEARCH(k, component, regexp, filter_status) \ +#define REGEX_SEARCH(k, component, regexp) \ if (std::regex_search("" #component "", regexp)) { \ (k) |= component_t::component; \ - (filter_status).components += "" #component ","; \ + filter_status().components += "" #component ","; \ } - REGEX_SEARCH(k, primitive, regexp, filter_status); - REGEX_SEARCH(k, reorder, regexp, filter_status); - REGEX_SEARCH(k, shuffle, regexp, filter_status); - REGEX_SEARCH(k, concat, regexp, filter_status); - REGEX_SEARCH(k, sum, regexp, filter_status); - REGEX_SEARCH(k, convolution, regexp, filter_status); - REGEX_SEARCH(k, deconvolution, regexp, filter_status); - REGEX_SEARCH(k, eltwise, regexp, filter_status); - REGEX_SEARCH(k, lrn, regexp, filter_status); - REGEX_SEARCH(k, batch_normalization, regexp, filter_status); - REGEX_SEARCH(k, inner_product, regexp, filter_status); - REGEX_SEARCH(k, rnn, regexp, filter_status); - REGEX_SEARCH(k, binary, regexp, filter_status); - REGEX_SEARCH(k, matmul, regexp, filter_status); - REGEX_SEARCH(k, resampling, regexp, filter_status); - REGEX_SEARCH(k, pooling, regexp, filter_status); - REGEX_SEARCH(k, reduction, regexp, filter_status); - REGEX_SEARCH(k, prelu, regexp, filter_status); - REGEX_SEARCH(k, softmax, regexp, filter_status); - REGEX_SEARCH(k, layer_normalization, regexp, filter_status); - REGEX_SEARCH(k, group_normalization, regexp, filter_status); - REGEX_SEARCH(k, graph, regexp, filter_status); - REGEX_SEARCH(k, gemm_api, regexp, filter_status); - REGEX_SEARCH(k, ukernel, regexp, filter_status); + REGEX_SEARCH(k, primitive, regexp); + REGEX_SEARCH(k, reorder, regexp); + REGEX_SEARCH(k, shuffle, regexp); + REGEX_SEARCH(k, concat, regexp); + REGEX_SEARCH(k, sum, regexp); + REGEX_SEARCH(k, convolution, regexp); + REGEX_SEARCH(k, deconvolution, regexp); + REGEX_SEARCH(k, eltwise, regexp); + REGEX_SEARCH(k, lrn, regexp); + REGEX_SEARCH(k, batch_normalization, regexp); + REGEX_SEARCH(k, inner_product, regexp); + REGEX_SEARCH(k, rnn, regexp); + REGEX_SEARCH(k, binary, regexp); + REGEX_SEARCH(k, matmul, regexp); + REGEX_SEARCH(k, resampling, regexp); + REGEX_SEARCH(k, pooling, regexp); + REGEX_SEARCH(k, reduction, regexp); + REGEX_SEARCH(k, prelu, regexp); + REGEX_SEARCH(k, softmax, regexp); + REGEX_SEARCH(k, layer_normalization, regexp); + REGEX_SEARCH(k, group_normalization, regexp); + REGEX_SEARCH(k, graph, regexp); + REGEX_SEARCH(k, gemm_api, regexp); + REGEX_SEARCH(k, ukernel, regexp); #undef REGEX_SEARCH } catch (const std::exception &e) { - filter_status.status = filter_status_t::flags::invalid; - filter_status.err_msg = e.what(); + filter_status().status = filter_status_t::flags::invalid; + filter_status().err_msg = e.what(); return component_t::all; } // filter enabled and at least one component is hit - if (filter_status.components.length() != 0) { + if (!filter_status().components.empty()) { // pop out the last comma - filter_status.components.pop_back(); - filter_status.status = filter_status_t::flags::valid; + filter_status().components.pop_back(); + filter_status().status = filter_status_t::flags::valid; } else { - filter_status.status = filter_status_t::flags::invalid; - filter_status.err_msg + 
filter_status().status = filter_status_t::flags::invalid; + filter_status().err_msg = "component with name \'" + s + "\' not found"; } return k; @@ -264,9 +272,7 @@ uint32_t get_verbose(verbose_t::flag_kind verbosity_kind, // update filter flags if (tok.rfind("filter=", 0) == 0) { auto filter_str = tok.substr(7); - if (!filter_str.empty()) { - flags = update_filter(filter_str, filter_status); - } + if (!filter_str.empty()) { flags = update_filter(filter_str); } } if (pos_en == std::string::npos) break; } @@ -284,17 +290,16 @@ uint32_t get_verbose(verbose_t::flag_kind verbosity_kind, int result = verbose.get() & verbosity_kind; if (verbosity_kind == verbose_t::debuginfo) result = verbose_t::get_debuginfo(verbose.get()); - if (result) print_header(filter_status); bool filter_result = flags & filter_kind; return filter_result ? result : 0; -#endif } - +#if !defined(DISABLE_VERBOSE) static setting_t verbose_timestamp {false}; +#endif bool get_verbose_timestamp() { #if defined(DISABLE_VERBOSE) return false; -#else +#endif if (verbose.get() == 0) return false; if (!verbose_timestamp.initialized()) { @@ -304,27 +309,8 @@ bool get_verbose_timestamp() { verbose_timestamp.set(val); } return verbose_timestamp.get(); -#endif } -#if defined(DISABLE_VERBOSE) -void pd_info_t::init( - dnnl::impl::engine_t *, const dnnl::impl::primitive_desc_t *) {} - -std::string rt_mds2str(primitive_kind_t prim_kind, const memory_desc_t *src_md, - const memory_desc_t *wei_md, const memory_desc_t *bia_md, - const memory_desc_t *dst_md) { - return std::string(); -} - -std::string rt_dims2fmt_str(primitive_kind_t prim_kind, - const memory_desc_t *src_md, const memory_desc_t *wei_md, - const memory_desc_t *dst_md) { - return std::string(); -} - -#else - std::ostream &operator<<(std::ostream &ss, engine_kind_t eng_kind) { ss << dnnl_engine_kind2str(eng_kind); return ss; @@ -392,6 +378,14 @@ std::string rnn_flags2str(unsigned flags) { return s; } +std::string cublasltfmt2str(const memory_desc_t *md) { + if (md->format_desc.cublaslt_blocked_desc.cublaslt_format + == cublaslt_memory_format_t::col32_2r_4r4) { + return ":col32_2r_4r4"; + } + return ""; +} + std::ostream &operator<<(std::ostream &ss, const memory_extra_desc_t &extra) { using namespace memory_extra_flags; @@ -400,6 +394,21 @@ std::ostream &operator<<(std::ostream &ss, const memory_extra_desc_t &extra) { ss << ":s8m" << extra.compensation_mask; if (extra.flags & compensation_conv_asymmetric_src) ss << ":zpm" << extra.asymm_compensation_mask; + if (extra.flags & compensation_gpu_conv_asymmetric_src) { + ss << ":zid" << extra.idhw[0]; + ss << ":zih" << extra.idhw[1]; + ss << ":ziw" << extra.idhw[2]; + ss << ":zod" << extra.odhw[0]; + ss << ":zoh" << extra.odhw[1]; + ss << ":zow" << extra.odhw[2]; + ss << ":zpd" << extra.pdhw[0]; + ss << ":zph" << extra.pdhw[1]; + ss << ":zpw" << extra.pdhw[2]; + ss << ":zdd" << extra.ddhw[0]; + ss << ":zdh" << extra.ddhw[1]; + ss << ":zdw" << extra.ddhw[2]; + ss << ":zs" << extra.dst_size; + } if (extra.flags & scale_adjust && extra.scale_adjust != 1.f) ss << ":sa" << extra.scale_adjust; return ss; @@ -408,28 +417,60 @@ std::ostream &operator<<(std::ostream &ss, const memory_extra_desc_t &extra) { std::string md2fmt_tag_str(const memory_desc_t *md) { memory_desc_wrapper mdw(md); - dims_t blocks = {0}; - mdw.compute_blocks(blocks); - - char dim_chars[DNNL_MAX_NDIMS + 1]; - dims_t ou_blocks = {0}; - utils::array_copy(ou_blocks, mdw.padded_dims(), mdw.ndims()); - - for (int d = 0; d < mdw.ndims(); ++d) { - dim_chars[d] = (blocks[d] == 1 ? 
'a' : 'A') + (char)d; - ou_blocks[d] /= blocks[d]; - } - // Can't report meaningful tag for runtime dimensions. if (mdw.has_runtime_strides()) return "*"; - dims_t strides; - const auto &blk = mdw.blocking_desc(); - utils::array_copy(strides, blk.strides, mdw.ndims()); + struct sort_key_t { + uint64_t stride_order; + dim_t outer_block; + int idx; + char dim_char; + }; + + dims_t blocks = {0}; + mdw.compute_blocks(blocks); - utils::simultaneous_sort(strides, ou_blocks, dim_chars, mdw.ndims(), - [](dim_t a, dim_t b) { return b - a; }); + std::vector<sort_key_t> sort_keys(mdw.ndims()); + const auto &pdims = mdw.padded_dims(); + const auto &blk = mdw.blocking_desc(); + for (int i = 0; i < mdw.ndims(); ++i) + // Assume that any dimension with stride 0 is outer relative to other + // dimensions. Use (uint64_t)(stride - 1) to sort a stride of 0 highest. + // Multiple dimensions with stride 0 is ambiguous. + sort_keys[i] = {(uint64_t)(blk.strides[i] - 1), pdims[i] / blocks[i], i, + (char)((blocks[i] == 1 ? 'a' : 'A') + i)}; + + // Old approach: utils::simultaneous_sort(strides, outer_blocks, dim_chars) + // input tag: acdb + // dims: 5x8x0x2 + // strides: 0x1x16x8 + // output tag: cdba + // + // New approach with std::sort and sort keys: + // input tag: acdb + // dims: 5x8x0x2 + // "stride orders": (BIG NUMBER)x0x15x7 + // output tag: acdb + std::sort(sort_keys.begin(), sort_keys.end(), + [](const sort_key_t &left, const sort_key_t &right) { + if (left.stride_order < right.stride_order) return false; + if (left.stride_order == right.stride_order) { + // WLOG, we can assume a dimension of size 1 has the same + // stride as the next outermost dimension. Sort the one with + // the non-unit outer block as the outer dimension. Multiple + // dimensions of size 1 with the same stride is ambiguous. + if (left.outer_block < right.outer_block) return false; + if (left.outer_block == right.outer_block) + // Sort 1x1x... outer blocks to (arbitrarily) list them + // in alphabetical order. + return left.idx < right.idx; + } + return true; + }); + char dim_chars[DNNL_MAX_NDIMS + 1]; + for (int i = 0; i < mdw.ndims(); ++i) + dim_chars[i] = sort_keys[i].dim_char; dim_chars[mdw.ndims()] = '\0'; std::string s(dim_chars);
@@ -512,6 +553,7 @@ std::string md2fmt_str( case format_kind::blocked: ss << ":" << md2fmt_tag_str(md) << ":" << md2fmt_strides_str(md); break; + case format_kind::cublaslt_blocked: ss << cublasltfmt2str(md); break; case format_kind::wino: case format_kind::rnn_packed: case format_kind::opaque: ss << "::"; break;
@@ -579,24 +621,13 @@ std::string md2desc_str(const memory_desc_t *md) { return s; } -std::ostream &operator<<(std::ostream &ss, const runtime_scales_t &scale) { - ss << scale.mask_; - ss << ":" << scale.data_type_; - if (scale.ndims_) { - ss << ":"; - for (int i = 0; i < scale.ndims_ - 1; ++i) - ss << scale.group_dims_[i] << 'x'; - ss << scale.group_dims_[scale.ndims_ - 1]; - } - return ss; -} - -std::ostream &operator<<(std::ostream &ss, const scales_t &oscale) { - ss << oscale.mask_; - const float val = oscale.scales_[0]; +std::ostream &operator<<( + std::ostream &ss, const rnn_create_time_scales_t &rnn_scales) { + ss << rnn_scales.mask_; + const float val = rnn_scales.scales_[0]; // Can't use scientific flags since it breaks parsing on converter and // benchdnn side.
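// The sort_key_t comparator above orders dimensions outermost-first by
// stride. A minimal standalone sketch of the same idea on plain strides
// (illustrative only; the real md2fmt_tag_str additionally handles blocking,
// padding, zero strides, and size-1 tie-breaking):
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

static std::string strides2tag(const std::vector<int64_t> &strides) {
    std::vector<int> idx(strides.size());
    for (size_t i = 0; i < idx.size(); ++i) idx[i] = (int)i;
    // A larger stride means an outer dimension, so sort descending by stride.
    std::sort(idx.begin(), idx.end(), [&](int l, int r) {
        if (strides[l] != strides[r]) return strides[l] > strides[r];
        return l < r; // tie-break by dimension index, like sort_key_t::idx
    });
    std::string tag;
    for (int i : idx) tag += (char)('a' + i);
    return tag;
}

int main() {
    // NHWC-like strides for an 8x16x8x8 (abcd) tensor: a is outermost, then
    // c and d, and b is the unit-stride (innermost) dimension.
    std::cout << strides2tag({8 * 16 * 8, 1, 16 * 8, 16}) << '\n'; // "acdb"
    return 0;
}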
- if (oscale.mask_ == 0 || is_runtime_value(val)) + if (rnn_scales.mask_ == 0 || is_runtime_value(val)) ss << ":" << get_val_str(val); return ss; } @@ -680,7 +711,8 @@ std::ostream &operator<<(std::ostream &ss, const primitive_attr_t *attr) { const accumulation_mode_t &am = attr->acc_mode_; if (am != accumulation_mode::strict) { - ss << field_delim() << "attr-acc:" << dnnl_accumulation_mode2str(am); + ss << field_delim() + << "attr-acc-mode:" << dnnl_accumulation_mode2str(am); } const auto &rm = attr->rounding_mode_; @@ -701,53 +733,25 @@ std::ostream &operator<<(std::ostream &ss, const primitive_attr_t *attr) { if (deterministic) { ss << field_delim() << "attr-deterministic:" << deterministic; } + + // Fast exit if rest attributes were not specified. if (attr->has_default_values()) return ss; - const runtime_scales_t &os = attr->output_scales_; - if (!os.has_default_values()) { - ss << field_delim() << "attr-oscale:" << os; + const scales_t &scales = attr->scales_; + if (!scales.has_default_values()) { + ss << field_delim() << "attr-scales:" << scales.get_verbose(); } - const arg_scales_t &as = attr->scales_; - if (!as.has_default_values()) { - std::string delim = empty_delim; - ss << field_delim() << "attr-scales:"; - for (const auto &map_entry : as.scales_) { - const auto &val = map_entry.second; - if (val.has_default_values()) continue; - - int arg = map_entry.first; - ss << delim << arg2str(arg) << ":" << val; - delim = attr_delim; - } + const zero_points_t &zero_points = attr->zero_points_; + if (!zero_points.has_default_values()) { + ss << field_delim() << "attr-zero-points:" << zero_points.get_verbose(); } - - const zero_points_t &zp = attr->zero_points_; - if (!zp.has_default_values()) { - std::string delim = empty_delim; - ss << field_delim() << "attr-zero-points:"; - for (const auto &arg : {DNNL_ARG_SRC, DNNL_ARG_WEIGHTS, DNNL_ARG_DST}) { - if (zp.has_default_values(arg)) continue; - - int mask = 0; - zp.get(arg, &mask); - const auto dt = zp.get_data_type(arg); - - ss << delim << arg2str(arg) << ":" << mask << ":" << dt; - - const auto &g_ndim = zp.get_groups_ndims(arg); - if (g_ndim) { - const auto &g_dims = zp.get_groups(arg); - ss << ":"; - for (int i = 0; i < g_ndim - 1; ++i) - ss << g_dims[i] << 'x'; - ss << g_dims[g_ndim - 1]; - } - - delim = attr_delim; - } + const auto &legacy_input_zp = attr->input_zero_points_; + if (!legacy_input_zp.has_default_values()) { + ss << "attr-legacy-input-zero-points:"; + ss << ":" << get_val_str(legacy_input_zp.mask_) << ":" << get_val_str(legacy_input_zp.count_); + ss << " "; } - const post_ops_t &po = attr->post_ops_; if (!po.has_default_values()) { std::string delim = empty_delim; @@ -766,12 +770,14 @@ std::ostream &operator<<(std::ostream &ss, const primitive_attr_t *attr) { if (s.dt != data_type::undef) ss << ":" << s.dt; } break; case primitive_kind::convolution: { - using namespace data_type; - const auto &c = e.depthwise_conv; - ss << delim << "dw:k" << c.kernel << "s" << c.stride << "p" - << c.padding; - if (c.wei_dt == s8 || c.dst_dt != f32) - ss << ":" << c.dst_dt; + // using namespace data_type; + // const auto &c = e.depthwise_conv; + // ss << delim << "dw:k" << c.kernel << "s" << c.stride << "p" + // << c.padding; + // if (c.wei_dt == s8 || c.dst_dt != f32) + // ss << ":" << c.dst_dt; + const char *alg_str = "depthwise_conv_old"; + ss << delim << alg_str; } break; case primitive_kind::eltwise: { const post_ops_t::entry_t::eltwise_t &ew = e.eltwise; @@ -783,7 +789,7 @@ std::ostream &operator<<(std::ostream &ss, const 
primitive_attr_t *attr) { } break; case primitive_kind::binary: { const post_ops_t::entry_t::binary_t &eb = e.binary; - const auto &md = eb.src1_desc; + const auto &md = eb.user_src1_desc; int mask = 0; for (int d = 0; d < md.ndims; ++d) mask += md.dims[d] != 1 ? (1 << d) : 0; @@ -792,7 +798,7 @@ std::ostream &operator<<(std::ostream &ss, const primitive_attr_t *attr) { switch (mdw.format_kind()) { case format_kind::blocked: if (!mdw.count_non_unit_dims(1)) - ss << ":" << md2fmt_tag_str(&md); + ss << ":" << md2fmt_tag_str(&eb.src1_desc); break; case format_kind::any: ss << ":any"; break; default: assert(!"unsupported format_kind"); @@ -803,6 +809,14 @@ std::ostream &operator<<(std::ostream &ss, const primitive_attr_t *attr) { ss << delim << "prelu" << ":" << ep.mask; } break; + case primitive_kind::depthwise: { + const post_ops_t::entry_t::depthwise_t &dw = e.depthwise; + ss << delim << dw.alg; + } break; + case primitive_kind::quantization: { + const post_ops_t::entry_t::quantization_t &qt = e.quantization; + ss << delim << qt.alg; + } break; default: assert(!"unsupported post op primitive kind!"); break; } delim = attr_delim; @@ -813,10 +827,18 @@ std::ostream &operator<<(std::ostream &ss, const primitive_attr_t *attr) { if (!rnn_qp.has_default_values()) { ss << field_delim() << "rnn_data_qparams:" << rnn_qp.scale_ << ":" << rnn_qp.shift_ << ";"; + ss << "rnn_data_qparams:" << rnn_qp.scale_ << ":" << rnn_qp.shift_ + << " "; + } + + const src_dyn_quant_params_t &dyn_qp = attr->src_dyn_quant_params_; + if (!dyn_qp.has_default_values()) { + ss << "src_dyn_quant_group_size:" << dyn_qp.get() << ";"; } if (!attr->dropout_.has_default_values()) { - const memory_desc_wrapper mdw(attr->dropout_.dropout_desc_); + ss << field_delim() << "attr-dropout"; + const memory_desc_wrapper mdw(attr->dropout_.user_dropout_desc_); switch (mdw.format_kind()) { case format_kind::blocked: if (!mdw.count_non_unit_dims(1)) @@ -1525,8 +1547,9 @@ std::string init_info_softmax(const engine_t *e, const pd_t *pd) { << " "; ss << md2fmt_str("dst", dst_md, pd->dst_md(0, true)->format_kind); if (!types::is_zero_md(diff_dst_md)) { - ss << md2fmt_str( - "diff_dst", diff_dst_md, pd->diff_dst_md(0, true)->format_kind); + ss << " " + << md2fmt_str("diff_dst", diff_dst_md, + pd->diff_dst_md(0, true)->format_kind); } ss << "," << pd->attr() << ","; @@ -1556,6 +1579,51 @@ std::string init_info_sum(const engine_t *e, const pd_t *pd) { return ss.str(); } +template +std::string init_info_sdpa(const engine_t *e, const pd_t *pd) { + std::stringstream ss; + ss << e << "," << pd->kind() << "," << pd->name() << ","; + + const sdpa_desc_t *desc = pd->desc(); + + std::string delimiter; + if (!desc->kq_scales.has_default_values()) { + ss << delimiter << "kq_attr-scales:wei:" << desc->kq_scales; + delimiter = "+"; + } + if (!desc->kq_zero_points.has_default_values()) { + ss << delimiter + << "kq_attr-zero-points:" << desc->kq_zero_points.get_verbose(); + delimiter = "+"; + } + + if (!desc->vs_scales.has_default_values()) { + ss << delimiter << "vs_attr-scales:wei:" << desc->vs_scales; + delimiter = "+"; + } + if (!desc->vs_zero_points.has_default_values()) { + ss << delimiter + << "vs_attr-zero-points:" << desc->vs_zero_points.get_verbose(); + } + + ss << ",query:" << pd->qry_md()->data_type << ":" + << md2dim_str(pd->qry_md()); + ss << ",key:" << pd->key_md()->data_type << ":" << md2dim_str(pd->key_md()) + << ":" << md2fmt_tag_str(pd->key_md()); + ss << ",val:" << pd->val_md()->data_type << ":" << md2dim_str(pd->val_md()); + if 
(pd->with_attn_mask()) { + ss << ",msk:" << pd->attn_mask_md()->data_type << ":" + << md2dim_str(pd->attn_mask_md()); + } else if (pd->with_causal_mask()) { + if (desc->mask_type == attn_mask_type::top_left) + ss << ",msk:causal:top_left"; + else + ss << ",msk:causal:bottom_right"; + } + + return ss.str(); +} + } // namespace std::string rt_mds2str(primitive_kind_t prim_kind, const memory_desc_t *src_md, @@ -1564,6 +1632,10 @@ std::string rt_mds2str(primitive_kind_t prim_kind, const memory_desc_t *src_md, // Note: pass format_kind::undef since runtime dims-ed mds can't have // format_kind::any at any stage. std::string s; +#if defined(DISABLE_VERBOSE) + return s; +#endif + switch ((int)prim_kind) { case primitive_kind::matmul: s = mds2str_matmul(src_md, format_kind::undef, wei_md, @@ -1607,13 +1679,15 @@ std::string prepend_identifier_and_version(const char *fmt_str) { } void verbose_printf_impl(const char *raw_fmt_str, verbose_t::flag_kind kind) { +#if defined(DISABLE_VERBOSE) + return; +#endif + + if (get_verbose(kind)) print_header(); + const auto &fmt_str = prepend_identifier_and_version(raw_fmt_str); #ifdef DNNL_EXPERIMENTAL_LOGGING - // by default, verbose_t::create_check is passed to the logger - // so that it prints at spdlog log_level_t::info when no verbose flag - // is specified. This is useful for printing headers, format fields, etc. - // which do not correspond to a specific verbose kind. const log_manager_t &log_manager = log_manager_t::get_log_manager(); if (log_manager.is_logger_enabled()) @@ -1632,6 +1706,10 @@ std::string rt_dims2fmt_str(primitive_kind_t prim_kind, const memory_desc_t *src_md, const memory_desc_t *wei_md, const memory_desc_t *dst_md) { std::string s; +#if defined(DISABLE_VERBOSE) + return s; +#endif + switch ((int)prim_kind) { case primitive_kind::matmul: s = dims2fmt_str_matmul(src_md, wei_md); @@ -1661,6 +1739,7 @@ std::string rt_dims2fmt_str(primitive_kind_t prim_kind, } void pd_info_t::init(engine_t *engine, const primitive_desc_t *pd) { + // Handles VERBOSE_DISABLE since `is_initialized_` is set to `true`. 
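// The guard below pairs a cheap `is_initialized_` test with std::call_once so
// the info string is built at most once even when several threads race on
// first use. A generic standalone sketch of that pattern (hypothetical type,
// not the library's pd_info_t; an atomic flag keeps the fast path race-free):
#include <atomic>
#include <iostream>
#include <mutex>
#include <string>

struct lazy_info_t {
    const std::string &get() {
        if (!is_initialized_.load()) {
            std::call_once(initialization_flag_, [&] {
                str_ = "expensive-to-build description"; // built exactly once
                is_initialized_.store(true);
            });
        }
        return str_; // later calls skip the call_once machinery entirely
    }

private:
    std::once_flag initialization_flag_;
    std::atomic<bool> is_initialized_ {false};
    std::string str_;
};

int main() {
    lazy_info_t info;
    std::cout << info.get() << '\n';
    std::cout << info.get() << '\n'; // fast path on the second call
    return 0;
}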
if (is_initialized_) return; std::call_once(initialization_flag_, [&] { @@ -1692,9 +1771,7 @@ void pd_info_t::init(engine_t *engine, const primitive_desc_t *pd) { CASE(shuffle); CASE(softmax); CASE(sum); - case primitive_kind::sdpa: - str_ = "sdpa, unknown info"; - break; + CASE(sdpa); case primitive_kind::zero_pad: str_ = "zero_pad, unknown info"; break; @@ -1708,7 +1785,6 @@ void pd_info_t::init(engine_t *engine, const primitive_desc_t *pd) { is_initialized_ = true; }); } -#endif } // namespace impl } // namespace dnnl @@ -1719,10 +1795,8 @@ dnnl_status_t dnnl_set_verbose(int level) { if (level < 0 || level > 2) return invalid_arguments; uint32_t verbose_level = verbose_t::none; - if (level == 1) verbose_level = verbose_t::error | verbose_t::exec_profile; - if (level == 2) - verbose_level = verbose_t::error | verbose_t::exec_profile - | verbose_t::create_profile; + if (level == 1) verbose_level = verbose_t::level1; + if (level == 2) verbose_level = verbose_t::level2; // we put the lower byte of level as devinfo to preserve backward // compatibility with historical VERBOSE={1,2} if (level == 1 || level == 2) verbose_level |= (level << 24); diff --git a/src/common/verbose.hpp b/src/common/verbose.hpp index c2839c67a6e..b6315a8f5c1 100644 --- a/src/common/verbose.hpp +++ b/src/common/verbose.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2018-2024 Intel Corporation +* Copyright 2018-2025 Intel Corporation * Copyright 2023 Arm Ltd. and affiliates * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -57,14 +57,14 @@ inline constexpr size_t get_file_name_offset(T (&str)[1]) { return 0; } template -struct const_expr_value { +struct const_expr_value_t { static constexpr const T value = v; }; } // namespace utility #define UTILITY_CONST_EXPR_VALUE(exp) \ - utility::const_expr_value::value + utility::const_expr_value_t::value #define __FILENAME__ (&__FILE__[utility::get_file_name_offset(__FILE__)]) @@ -73,12 +73,12 @@ struct const_expr_value { // The string can contain format specifiers which are provided in VA_ARGS // Note: using ##__VAR_ARGS__ is necessary to avoid trailing comma in printf call -#define VFORMAT(stamp, apitype, logtype, logsubtype, msg, ...) \ +#define VFORMAT(stamp, flagkind, apitype, logtype, logsubtype, msg, ...) \ do { \ std::string stamp_; \ if (dnnl::impl::get_verbose_timestamp()) \ stamp_ = std::to_string(stamp) + ","; \ - dnnl::impl::verbose_printf( \ + dnnl::impl::verbose_printf(flagkind, \ "%s" CONCAT2(VERBOSE_, apitype) "," CONCAT2( \ VERBOSE_, logtype) "%s," msg "\n", \ stamp_.c_str(), logsubtype, ##__VA_ARGS__); \ @@ -88,7 +88,8 @@ struct const_expr_value { #define VINFO(apitype, logtype, logsubtype, component, msg, ...) \ do { \ if (dnnl::impl::get_verbose(verbose_t::logtype##_##logsubtype)) \ - VFORMAT(get_msec(), apitype, logtype, VERBOSE_##logsubtype, \ + VFORMAT(get_msec(), verbose_t::logtype##_##logsubtype, apitype, \ + logtype, VERBOSE_##logsubtype, \ #component "," msg ",%s:%d", ##__VA_ARGS__, __FILENAME__, \ __LINE__); \ } while (0) @@ -116,8 +117,22 @@ struct const_expr_value { #define VERROR(apitype, component, msg, ...) 
\ do { \ if (dnnl::impl::get_verbose(verbose_t::error)) { \ - VFORMAT(get_msec(), apitype, error, "", #component "," msg, \ - ##__VA_ARGS__); \ + VFORMAT(get_msec(), verbose_t::error, apitype, error, "", \ + #component "," msg ",%s:%d", ##__VA_ARGS__, __FILENAME__, \ + __LINE__); \ + } \ + } while (0) + +// Special syntactic sugar for warnings, plus flush of the output stream +// The difference between the warn and error verbose modes is that the +// verbose error messages are only reserved for printing when an exception is +// thrown or when a status check fails. +#define VWARN(apitype, component, msg, ...) \ + do { \ + if (dnnl::impl::get_verbose(verbose_t::warn)) { \ + VFORMAT(get_msec(), verbose_t::warn, apitype, warn, "", \ + #component "," msg ",%s:%d", ##__VA_ARGS__, __FILENAME__, \ + __LINE__); \ } \ } while (0) @@ -127,17 +142,21 @@ struct const_expr_value { do { \ if (dnnl::impl::get_verbose_dev_mode(verbose_t::debuginfo) \ >= (level)) { \ - VFORMAT(get_msec(), apitype, debuginfo, "", #component "," msg, \ - ##__VA_ARGS__); \ + VFORMAT(get_msec(), verbose_t::debuginfo, apitype, debuginfo, "", \ + #component "," msg ",%s:%d", ##__VA_ARGS__, __FILENAME__, \ + __LINE__); \ } \ } while (0) // Special syntactic sugar for logging performance // NOTE: the VPROF macro does not check for verbose flags, it is the -// responsibility of the caller do check those (it should happen +// responsibility of the caller to check those (it should happen // anyway to condition collecting stamp/duration) #define VPROF(stamp, apitype, logtype, logsubtype, info, duration) \ - { VFORMAT(stamp, apitype, logtype, logsubtype, "%s,%g", info, duration); } + { \ + VFORMAT(stamp, dnnl::impl::verbose_t::exec_profile, apitype, logtype, \ + logsubtype, "%s,%g", info, duration); \ + } struct verbose_t { enum flag_kind : uint32_t { @@ -152,9 +171,13 @@ struct verbose_t { exec_check = 1 << 6, exec_profile = 1 << 7, profile_externals = 1 << 8, + warn = 1 << 9, // the upper 8 bits are reserved for devinfo levels debuginfo = 1 << 24, // + level1 = error | exec_profile | warn, + level2 = error | exec_profile | warn | create_profile, + all = (uint32_t)-1, }; @@ -234,6 +257,8 @@ get_verbose_to_log_level_map() { verbose_to_log_map { {verbose_t::all, log_manager_t::trace}, {verbose_t::debuginfo, log_manager_t::debug}, + {verbose_t::level1, log_manager_t::info}, + {verbose_t::level2, log_manager_t::info}, {verbose_t::create_dispatch, log_manager_t::info}, {verbose_t::create_check, log_manager_t::info}, {verbose_t::create_profile, log_manager_t::info}, @@ -241,6 +266,7 @@ get_verbose_to_log_level_map() { {verbose_t::exec_profile, log_manager_t::info}, {verbose_t::exec_check, log_manager_t::error}, {verbose_t::error, log_manager_t::critical}, + {verbose_t::warn, log_manager_t::warn}, {verbose_t::none, log_manager_t::off}, }; return verbose_to_log_map; @@ -279,6 +305,10 @@ inline std::string format_verbose_string( // processes fixed strings for logging and printing inline void verbose_printf(const char *fmt_str) { + // by default, verbose_t::create_check is passed to the logger + // so that it prints at spdlog log_level_t::info when no verbose flag + // is specified. This is useful for printing headers, format fields, etc. + // which do not correspond to a specific verbose kind. verbose_printf_impl(fmt_str, verbose_t::create_check); } @@ -293,6 +323,10 @@ inline void verbose_printf(verbose_t::flag_kind kind, const char *fmt_str) { template inline void verbose_printf(const char *fmt_str, str_args... 
args) { std::string msg = format_verbose_string(fmt_str, args...); + // by default, verbose_t::create_check is passed to the logger + // so that it prints at spdlog log_level_t::info when no verbose flag + // is specified. This is useful for printing headers, format fields, etc. + // which do not correspond to a specific verbose kind. verbose_printf_impl(msg.c_str(), verbose_t::create_check); } @@ -348,6 +382,7 @@ std::string md2fmt_str( const char *name, const memory_desc_t *md, format_kind_t user_format); std::string md2dim_str( const memory_desc_t *md, dims_type_t dims_type = dims_type_t::dims); +std::string arg2str(int arg); // Returns a verbose string of dimensions or descriptor from src, wei, and/or // dst memory descs. Can be called externally to provide info about actual // values of runtime dimensions. diff --git a/src/common/verbose_msg.hpp b/src/common/verbose_msg.hpp index 67e52c1f3c8..cf92ffbbfa4 100644 --- a/src/common/verbose_msg.hpp +++ b/src/common/verbose_msg.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2023-2024 Intel Corporation +* Copyright 2023-2025 Intel Corporation * Copyright 2023 Arm Ltd. and affiliates * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -26,6 +26,7 @@ // log type strings #define VERBOSE_error "error" +#define VERBOSE_warn "warning" #define VERBOSE_create "create" #define VERBOSE_create_nested "create_nested" #define VERBOSE_exec "exec" @@ -42,9 +43,9 @@ // verbose messages #define VERBOSE_PROFILING_UNSUPPORTED "profiling capabilities are not supported" #define VERBOSE_INVALID_DEVICE_ENV "invalid %s device in environment: index %zu" -#define VERBOSE_INVALID_ENGINE_KIND "no %s device is available" +#define VERBOSE_INVALID_ENGINE_KIND "no %s %s device is available" #define VERBOSE_INVALID_ENGINE_IDX \ - "%zu %s devices are available but %zu was queried" + "%zu %s devices are available but device index %zu was queried" #define VERBOSE_INVALID_ACC_MODE "bad accumulation mode %s" #define VERBOSE_NULL_ARG "one of the mandatory arguments is nullptr" #define VERBOSE_BAD_ENGINE_KIND "bad engine kind" @@ -64,6 +65,9 @@ #define VERBOSE_INCONSISTENT_DIM "dimension %s:%d is inconsistent with %s:%d" #define VERBOSE_INCONSISTENT_NDIMS \ "tensors %s and %s have inconsistent number of dimensions" +// TODO: replace the version above with the version below. 
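// level1/level2 introduced above are plain bitmask unions over flag_kind, so
// ONEDNN_VERBOSE=1 now implies warnings as well. A standalone check of that
// composition (only the exec_profile and warn bit positions are taken from
// the header; the other positions are illustrative assumptions):
#include <cassert>
#include <cstdint>

enum flag_kind : uint32_t {
    none = 0,
    error = 1 << 0, // illustrative position
    create_profile = 1 << 2, // illustrative position
    exec_profile = 1 << 7,
    warn = 1 << 9,
    level1 = error | exec_profile | warn,
    level2 = error | exec_profile | warn | create_profile,
};

int main() {
    assert(level1 & warn); // ONEDNN_VERBOSE=1 surfaces warnings
    assert(!(level1 & create_profile)); // creation profiling is level2-only
    assert((level2 & level1) == level1); // level2 is a superset of level1
    return 0;
}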
+#define VERBOSE_INCONSISTENT_NDIMS_WITH_VALS \ + "tensors %s and %s have inconsistent number of dimensions (%d) and (%d)" #define VERBOSE_INCONSISTENT_DT "tensors %s and %s have inconsistent datatypes" #define VERBOSE_INCONSISTENT_MDS "inconsistent %s and %s mds" #define VERBOSE_INCONSISTENT_ALPHA_BETA \ @@ -71,6 +75,7 @@ #define VERBOSE_INCONSISTENT_PRB "problem is not mathematically consistent" #define VERBOSE_BAD_NDIMS "%s has a bad number of dimensions %d" #define VERBOSE_BAD_DIM "bad dimension %s:%d" +#define VERBOSE_OUT_OF_RANGE_DIMS "out-of-range dimensions for %s" #define VERBOSE_UNSUPPORTED_ISA "unsupported isa" #define VERBOSE_UNSUPPORTED_DT "unsupported datatype" @@ -103,6 +108,8 @@ #define VERBOSE_WS_MISMATCH \ "workspace mismatch between forward and backward primitive " \ "descriptors" +#define VERBOSE_TENSOR_FORMAT_MISMATCH \ + "memory formats for %s and %s tensors do not match" #define VERBOSE_WS_INIT "workspace initialization failed" #define VERBOSE_SCRATCHPAD_INIT "scratchpad initialization unsuccessful" @@ -116,14 +123,16 @@ #define VERBOSE_IMPL_HEURISTIC_FAIL "heuristic fail: %s" #define VERBOSE_1x1CONV_HEURISTIC_FAIL "heuristic fail for 1x1 convolution: %s" #define VERBOSE_SCRATCHPAD_LIMIT "scratchpad memory limit exceeded" -#define VERBOSE_PRIMITIVE_CREATION_FAIL "failed to create nested primitive %s" +#define VERBOSE_PRIMITIVE_CREATION_FAIL "failed to create nested %s primitive" #define VERBOSE_DESC_CREATION_FAIL "failed to create %s descriptor" #define VERBOSE_SHAPE_RESTRICTION "failed shape restrictions" #define VERBOSE_INCOMPATIBLE_GEMM_FMT "incompatible gemm format" #define VERBOSE_DEVICE_CTX_MISMATCH "device not found in the given context" +#define VERBOSE_MISSING_OCL_DEVICE "%s OpenCL device not found" #define VERBOSE_INVALID_PLATFORM "unsupported %s platform (expected %s got %s)" #define VERBOSE_ENGINE_CREATION_FAIL "failed to create %s engine with index %zu" +#define VERBOSE_KERNEL_CREATION_FAIL "failed to create %s kernel" #define VERBOSE_DETERMINISTIC_FAIL "failed to run kernel deterministically" #define VERBOSE_SKIP_PRIMITIVE_IMPL \ "skipping or dispatching to another implementation" diff --git a/src/common/z_magic.hpp b/src/common/z_magic.hpp index 9baae4c8bab..597954e2844 100644 --- a/src/common/z_magic.hpp +++ b/src/common/z_magic.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2016-2022 Intel Corporation +* Copyright 2016-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -36,7 +36,7 @@ #define PRAGMA_MACRO(x) PRAGMA_MACRo(x) #endif -#define UNUSED(x) ((void)x) +#define UNUSED(x) ((void)(x)) #define MAYBE_UNUSED(x) UNUSED(x) #if defined(_WIN32) && !defined(__GNUC__) diff --git a/src/cpu/CMakeLists.txt b/src/cpu/CMakeLists.txt index 17ad1e4a59e..ab791ee7b2c 100644 --- a/src/cpu/CMakeLists.txt +++ b/src/cpu/CMakeLists.txt @@ -1,5 +1,5 @@ #=============================================================================== -# Copyright 2019-2024 Intel Corporation +# Copyright 2019-2025 Intel Corporation # Copyright 2020 Arm Ltd. 
and affiliates # Copyright 2021 FUJITSU LIMITED # @@ -22,6 +22,7 @@ file(GLOB_RECURSE SOURCES_EXTRA ${CMAKE_CURRENT_SOURCE_DIR}/matmul/*.[ch]pp ${CMAKE_CURRENT_SOURCE_DIR}/reorder/*.[ch]pp ${CMAKE_CURRENT_SOURCE_DIR}/rnn/*.[ch]pp + ${CMAKE_CURRENT_SOURCE_DIR}/ukernel/*.[ch]pp ) foreach(SOURCE_FILE ${SOURCES_EXTRA}) @@ -130,6 +131,7 @@ set(OBJ_LIB ${LIB_PACKAGE_NAME}_cpu) add_library(${OBJ_LIB} OBJECT ${SOURCES}) set_property(GLOBAL APPEND PROPERTY DNNL_LIB_DEPS $) +enable_conditional_compilation4(${OBJ_LIB}) if (DNNL_TARGET_ARCH STREQUAL "X64") add_subdirectory(x64) @@ -137,6 +139,9 @@ endif() if (DNNL_TARGET_ARCH STREQUAL "AARCH64") add_subdirectory(aarch64) endif() +if (DNNL_USE_ACL) + add_subdirectory(acl) +endif() if (DNNL_TARGET_ARCH STREQUAL "PPC64") add_subdirectory(ppc64) endif() diff --git a/src/cpu/README.md b/src/cpu/README.md index 75668c15c82..7641f9e825b 100644 --- a/src/cpu/README.md +++ b/src/cpu/README.md @@ -44,7 +44,9 @@ architecture. Hence, for portability reasons [`cpu/platform.hpp`](platform.hpp) header file provides a set of helpers macros that could help conditionally enable or disable parts of code. There the following macros defined: - `DNNL_X64` is 1 on x64 architecture; +- `DNNL_X86` is 1 on x86 architecture; - `DNNL_AARCH64` is 1 on Arm AArch64 architecture; +- `DNNL_ARM` is 1 on Arm 32 architecture; - `DNNL_PPC64` is 1 on OpenPOWER / IBM Power architecture; - `DNNL_S390X` is 1 on IBMz / s390x architecture; - `DNNL_RV64` is 1 on RISC-V architecture; diff --git a/src/cpu/aarch64/CMakeLists.txt b/src/cpu/aarch64/CMakeLists.txt index 432a00bc70a..32eec64988c 100644 --- a/src/cpu/aarch64/CMakeLists.txt +++ b/src/cpu/aarch64/CMakeLists.txt @@ -20,21 +20,6 @@ file(GLOB_RECURSE SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/*.[ch]pp ) -file(GLOB XBYAK_AARCH64_FILES - ${CMAKE_CURRENT_SOURCE_DIR}/xbyak_aarch64/src/xbyak_aarch64_impl.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/xbyak_aarch64/src/util_impl.cpp - ) - -list(REMOVE_ITEM SOURCES ${XBYAK_AARCH64_FILES}) - -if(NOT DNNL_AARCH64_USE_ACL) - file(GLOB_RECURSE ACL_FILES - ${CMAKE_CURRENT_SOURCE_DIR}/acl_*.[ch] - ${CMAKE_CURRENT_SOURCE_DIR}/acl_*.[ch]pp - ) - list(REMOVE_ITEM SOURCES ${ACL_FILES}) -endif() - # If the runtime is not THREADPOOL remove threadpool_scheduler sources. if(NOT DNNL_CPU_RUNTIME STREQUAL "THREADPOOL") list(APPEND ACL_THREADPOOL_FILES @@ -48,5 +33,6 @@ set(OBJ_LIB ${LIB_PACKAGE_NAME}_cpu_aarch64) add_library(${OBJ_LIB} OBJECT ${SOURCES}) set_property(GLOBAL APPEND PROPERTY DNNL_LIB_DEPS $) +enable_conditional_compilation4(${OBJ_LIB}) -add_subdirectory(xbyak_aarch64) +add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/xbyak_aarch64 xbyak_aarch64) diff --git a/src/cpu/aarch64/acl_convolution_utils.hpp b/src/cpu/aarch64/acl_convolution_utils.hpp deleted file mode 100644 index 37a3d6c3d98..00000000000 --- a/src/cpu/aarch64/acl_convolution_utils.hpp +++ /dev/null @@ -1,183 +0,0 @@ -/******************************************************************************* -* Copyright 2020-2024 Arm Ltd. and affiliates -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -#ifndef CPU_AARCH64_ACL_CONVOLUTION_UTILS_HPP -#define CPU_AARCH64_ACL_CONVOLUTION_UTILS_HPP - -#include -#include "acl_post_ops.hpp" -#include "acl_utils.hpp" -#include "arm_compute/runtime/experimental/operators/CpuDepthwiseConv2d.h" -#include "cpu/cpu_convolution_pd.hpp" -#include -namespace dnnl { -namespace impl { -namespace cpu { -namespace aarch64 { - -template -struct acl_obj_t { - ConvOp conv; - arm_compute::experimental::MemoryRequirements aux_mem_req; -}; - -struct acl_conv_conf_t { - bool with_bias; - bool fast_math; - // If this is true, the result of the convolution goes into a temporarily - // allocated ACL tensor to be accumulated into the oneDNN dst during postops - bool use_dst_acc_for_sum; - // Tells that the selected algorithm is Winograd. This is needed because the - // algorithm can be set to algorithm::convolution_auto and later on we need to - // skip fixed-format protocol as ACL Winograd does not support it. - bool alg_winograd; - arm_compute::TensorInfo src_tensor_info; - arm_compute::TensorInfo wei_tensor_info; - arm_compute::TensorInfo bia_tensor_info; - arm_compute::TensorInfo dst_tensor_info; - - arm_compute::PadStrideInfo padstride_info; - arm_compute::Size2D dilation_info; - // Additional information about the weights not included in wei_tensor_info - arm_compute::WeightsInfo weights_info; - // Note: this will default to not enabled, and will do nothing - arm_compute::ActivationLayerInfo act_info; -}; - -namespace acl_convolution_utils { - -status_t acl_init_conf(acl_conv_conf_t &acp, memory_desc_t &src_md, - memory_desc_t &weights_md, memory_desc_t &dst_md, - memory_desc_t &bias_md, const convolution_desc_t &cd, - const primitive_attr_t &attr); - -} // namespace acl_convolution_utils - -// Keys are anonymous with local linkage. So deduce the type automagically. -using conv_key_t = decltype(memory_tracking::names::key_gemm_tmp_buffer); - -template -status_t init_scratchpad(op_t &conv, memory_tracking::registrar_t &scratchpad, - const std::map &conv_keys, engine_t *engine, - post_ops_t &post_ops, dnnl::impl::post_ops_t &attr_post_ops, - arm_compute::ActivationLayerInfo &act_info, bool &use_dst_acc_for_sum, - const dnnl::impl::memory_desc_t &dst_md) { - - // Book temp mem. - const auto aux_mem_req = conv.workspace(); - for (const auto &key : conv_keys) { - const auto id = key.first; - if (aux_mem_req[id].size > 0) { - scratchpad.book(key.second, aux_mem_req[id].size, 1, - aux_mem_req[id].alignment, aux_mem_req[id].alignment); - } - } - - CHECK(post_ops.init(engine, attr_post_ops, dst_md, act_info)); - use_dst_acc_for_sum = post_ops.has_sum(); - - if (use_dst_acc_for_sum) { - const memory_desc_wrapper dst_d(&dst_md); - scratchpad.book(memory_tracking::names::key_generic_acc, dst_d.nelems(), - dst_d.data_type_size()); - } - - return status::success; -} - -template -status_t execute_forward_conv_acl(const exec_ctx_t &ctx, - conv_obj_t *acl_conv_obj, const conv_pd_t *pd, - const std::map &conv_keys) { - - auto src_base = CTX_IN_MEM(const src_data_t *, DNNL_ARG_SRC); - auto wei_base = CTX_IN_MEM(const wei_data_t *, DNNL_ARG_WEIGHTS); - - // import_memory() and free() methods do not allocate/free any additional - // memory, only acquire/release pointers. 
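// The comment above is the key to the deleted helper: ACL tensors can wrap
// externally owned memory with no copies. A minimal sketch of that zero-copy
// pattern, assuming Arm Compute Library headers and linkage are available:
#include <vector>

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/runtime/Tensor.h"

int main() {
    // Externally owned buffer, e.g. a oneDNN memory handle (2x3 f32).
    std::vector<float> buffer(2 * 3, 1.f);

    arm_compute::Tensor t;
    // init() records metadata only; no memory is allocated here.
    t.allocator()->init(arm_compute::TensorInfo(
            arm_compute::TensorShape(3U, 2U), 1, arm_compute::DataType::F32));
    // import_memory() acquires the external pointer without copying it.
    t.allocator()->import_memory(buffer.data());

    // ... run an ACL operator with `t` in its ITensorPack ...

    // free() releases the pointer; the external buffer stays alive.
    t.allocator()->free();
    return 0;
}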
- arm_compute::Tensor src_tensor; - arm_compute::Tensor wei_tensor; - arm_compute::Tensor bia_tensor = nullptr; - arm_compute::Tensor dst_tensor; - - auto const acp = pd->acp_; - src_tensor.allocator()->init(acp.src_tensor_info); - wei_tensor.allocator()->init(acp.wei_tensor_info); - dst_tensor.allocator()->init(acp.dst_tensor_info); - - src_tensor.allocator()->import_memory(const_cast(src_base)); - wei_tensor.allocator()->import_memory(const_cast(wei_base)); - - const auto scratchpad = ctx.get_scratchpad_grantor(); - - // If we have an unfused sum post op, put the result in a scratchpad tensor. - // Result will be summed to the dst during acl_post_ops.execute - auto dst_base = acp.use_dst_acc_for_sum - ? scratchpad.get(memory_tracking::names::key_generic_acc) - : CTX_OUT_MEM(dst_data_t *, DNNL_ARG_DST); - dst_tensor.allocator()->import_memory(dst_base); - - if (acp.with_bias) { - auto bia_base = CTX_IN_MEM(const bia_data_t *, DNNL_ARG_BIAS); - bia_tensor.allocator()->init(acp.bia_tensor_info); - bia_tensor.allocator()->import_memory( - const_cast(bia_base)); - } - - // Constness of the weight tensor matters for depthwise conv in ACL. - // Otherwise, it will package the weights more often than needed, as - // it will expect the weights to change within the duration of the run - // func. - arm_compute::ITensorPack pack; - pack.add_tensor(arm_compute::TensorType::ACL_SRC_0, &src_tensor); - pack.add_const_tensor(arm_compute::TensorType::ACL_SRC_1, &wei_tensor); - pack.add_const_tensor(arm_compute::TensorType::ACL_SRC_2, &bia_tensor); - pack.add_tensor(arm_compute::TensorType::ACL_DST, &dst_tensor); - - // Get temp workspaces. - const auto aux_mem = acl_conv_obj->aux_mem_req; - - // Hold onto tmp tensors while we need pack. - std::vector tmp_tensors(aux_mem.size()); - for (const auto &key : conv_keys) { - const auto id = key.first; - if (aux_mem[id].size > 0) { - const auto info = arm_compute::TensorInfo( - arm_compute::TensorShape(aux_mem[id].size), 1, - arm_compute::DataType::U8); - auto buffer = scratchpad.get(key.second); - tmp_tensors[id].allocator()->init(info, aux_mem[id].alignment); - tmp_tensors[id].allocator()->import_memory(buffer); - pack.add_tensor(aux_mem[id].slot, &tmp_tensors[id]); - } - } - - acl_conv_obj->conv.run(pack); - - void *dst = dst_tensor.buffer(); - pd->post_ops.execute(ctx, dst); - - return status::success; -} - -} // namespace aarch64 -} // namespace cpu -} // namespace impl -} // namespace dnnl - -#endif // CPU_AARCH64_ACL_CONVOLUTION_UTILS_HPP diff --git a/src/cpu/aarch64/acl_reorder.cpp b/src/cpu/aarch64/acl_reorder.cpp index 061751b5557..73e38c0c4bb 100644 --- a/src/cpu/aarch64/acl_reorder.cpp +++ b/src/cpu/aarch64/acl_reorder.cpp @@ -19,7 +19,7 @@ namespace dnnl { namespace impl { namespace cpu { -namespace aarch64 { +namespace acl { status_t acl_reorder_fwd_t::execute_forward(const exec_ctx_t &ctx) const { // Lock here is needed because resource_mapper does not support @@ -46,7 +46,7 @@ status_t acl_reorder_fwd_t::execute_forward(const exec_ctx_t &ctx) const { return status::success; } -} // namespace aarch64 +} // namespace acl } // namespace cpu } // namespace impl } // namespace dnnl diff --git a/src/cpu/aarch64/acl_reorder.hpp b/src/cpu/aarch64/acl_reorder.hpp index e586ed4e304..617053841be 100644 --- a/src/cpu/aarch64/acl_reorder.hpp +++ b/src/cpu/aarch64/acl_reorder.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2023-2024 Arm Ltd. and affiliates +* Copyright 2023-2025 Arm Ltd. 
and affiliates * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -13,19 +13,19 @@ * See the License for the specific language governing permissions and * limitations under the License. *******************************************************************************/ -#ifndef CPU_AARCH64_ACL_REORDER_HPP -#define CPU_AARCH64_ACL_REORDER_HPP +#ifndef CPU_ACL_REORDER_HPP +#define CPU_ACL_REORDER_HPP #include "arm_compute/core/Types.h" #include "common/utils.hpp" -#include "cpu/aarch64/acl_utils.hpp" +#include "cpu/acl/acl_utils.hpp" #include "cpu/aarch64/cpu_isa_traits.hpp" #include "cpu/reorder/cpu_reorder_pd.hpp" namespace dnnl { namespace impl { namespace cpu { -namespace aarch64 { +namespace acl { struct acl_reorder_obj_t { arm_compute::NEReorderLayer reorder; @@ -95,12 +95,12 @@ struct acl_reorder_fwd_t : public primitive_t { if (!ok) return status::unimplemented; - int mask = -1; - bool is_set = false; - CHECK(attr->scales_.get(DNNL_ARG_DST, &mask, &is_set)); - const memory_desc_wrapper input_d(src_md); - if (input_d.has_runtime_dims_or_strides() && is_set && mask > 0) - return status::unimplemented; + if (!attr->scales_.has_default_values(DNNL_ARG_DST)) { + int mask = attr->scales_.get_mask(DNNL_ARG_DST); + const memory_desc_wrapper input_d(src_md); + if (input_d.has_runtime_dims_or_strides() && mask > 0) + return status::unimplemented; + } // Create and check primitive descriptor auto _pd = make_unique_pd(attr, src_engine->kind(), src_md, @@ -131,7 +131,7 @@ struct acl_reorder_fwd_t : public primitive_t { if (dst_tag == format_tag::BA4b4a || dst_tag == format_tag::Acdb4a || dst_tag == format_tag::Ab4a) { _pd->app_.dst_wf = arm_compute::WeightFormat::OHWIo4; - } else if (mayiuse(sve_256) + } else if (aarch64::mayiuse(aarch64::sve_256) && (dst_tag == format_tag::BA8b4a || dst_tag == format_tag::Acdb8a || dst_tag == format_tag::Ab8a)) { @@ -147,13 +147,17 @@ struct acl_reorder_fwd_t : public primitive_t { switch (src_md->ndims) { case 2: { if (src_tag == format_tag::ab - && dst_md->data_type == data_type::bf16) { // bf16 + && dst_md->data_type == data_type::bf16 + && utils::one_of(dst_tag, format_tag::BA8b4a, + format_tag::BA4b4a)) { // bf16 acl_tensor_shape_in = arm_compute::TensorShape( src_md->dims[0], src_md->dims[1]); acl_tensor_shape_out = arm_compute::TensorShape( dst_md->padded_dims[0], dst_md->padded_dims[1]); } else if (src_tag == format_tag::ba - && dst_md->data_type == data_type::f32) { // f32 + && dst_md->data_type == data_type::f32 + && !utils::one_of(dst_tag, format_tag::BA8b4a, + format_tag::BA4b4a)) { // f32 acl_tensor_shape_in = arm_compute::TensorShape( src_md->dims[1], src_md->dims[0]); acl_tensor_shape_out = arm_compute::TensorShape( @@ -239,9 +243,9 @@ struct acl_reorder_fwd_t : public primitive_t { }; // acl_reorder_fwd_t -} // namespace aarch64 +} // namespace acl } // namespace cpu } // namespace impl } // namespace dnnl -#endif // CPU_AARCH64_ACL_REORDER_HPP +#endif // CPU_ACL_REORDER_HPP diff --git a/src/cpu/aarch64/acl_softmax.cpp b/src/cpu/aarch64/acl_softmax.cpp deleted file mode 100644 index 976b33665d2..00000000000 --- a/src/cpu/aarch64/acl_softmax.cpp +++ /dev/null @@ -1,52 +0,0 @@ -/******************************************************************************* -* Copyright 2021-2022 Arm Ltd. and affiliates -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. 
-* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -#include "cpu/aarch64/acl_softmax.hpp" - -namespace dnnl { -namespace impl { -namespace cpu { -namespace aarch64 { - -status_t acl_softmax_fwd_t::execute_forward(const exec_ctx_t &ctx) const { - - // Lock here is needed because resource_mapper does not support - // concurrent multithreaded access. - std::lock_guard _lock {this->mtx}; - - auto src = CTX_IN_MEM(const void *, DNNL_ARG_SRC); - auto dst = CTX_OUT_MEM(void *, DNNL_ARG_DST); - - // Retrieve primitive resource and configured Compute Library objects - auto *acl_resource - = ctx.get_resource_mapper()->get(this); - acl_softmax_obj_t &acl_obj = acl_resource->get_acl_obj(); - - acl_obj.src_tensor.allocator()->import_memory(const_cast(src)); - acl_obj.dst_tensor.allocator()->import_memory(dst); - - acl_obj.softmax->run(); - - acl_obj.src_tensor.allocator()->free(); - acl_obj.dst_tensor.allocator()->free(); - - return status::success; -} - -} // namespace aarch64 -} // namespace cpu -} // namespace impl -} // namespace dnnl diff --git a/src/cpu/aarch64/acl_softmax.hpp b/src/cpu/aarch64/acl_softmax.hpp deleted file mode 100644 index 020e6ca5ab0..00000000000 --- a/src/cpu/aarch64/acl_softmax.hpp +++ /dev/null @@ -1,240 +0,0 @@ -/******************************************************************************* -* Copyright 2021-2024 Arm Ltd. and affiliates -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
-*******************************************************************************/ - -#ifndef CPU_AARCH64_ACL_SOFTMAX_HPP -#define CPU_AARCH64_ACL_SOFTMAX_HPP - -#include "cpu/cpu_softmax_pd.hpp" - -#include "cpu/aarch64/acl_utils.hpp" - -namespace dnnl { -namespace impl { -namespace cpu { -namespace aarch64 { - -struct acl_softmax_obj_t { - std::unique_ptr softmax; - arm_compute::Tensor src_tensor; - arm_compute::Tensor dst_tensor; -}; - -struct acl_softmax_conf_t { - arm_compute::TensorInfo src_info; - arm_compute::TensorInfo dst_info; - float beta; - int32_t axis; - bool is_logsoftmax; -}; - -struct acl_softmax_resource_t : public resource_t { - acl_softmax_resource_t() - : acl_obj_(utils::make_unique()) {} - - status_t configure(const acl_softmax_conf_t &asp) { - if (!acl_obj_) return status::out_of_memory; - - // Init Compute Library tensors based on info from descriptor - acl_obj_->src_tensor.allocator()->init(asp.src_info); - acl_obj_->dst_tensor.allocator()->init(asp.dst_info); - - if (asp.is_logsoftmax) { - auto logsoftmax - = std::make_unique(); - // clang-format off - logsoftmax->configure( - &acl_obj_->src_tensor, - &acl_obj_->dst_tensor, - asp.beta, - asp.axis); - // clang-format on - acl_obj_->softmax = std::move(logsoftmax); - } else { - auto softmax = std::make_unique(); - // clang-format off - softmax->configure( - &acl_obj_->src_tensor, - &acl_obj_->dst_tensor, - asp.beta, - asp.axis); - // clang-format on - acl_obj_->softmax = std::move(softmax); - } - - return status::success; - } - - acl_softmax_obj_t &get_acl_obj() const { return *acl_obj_; } - - DNNL_DISALLOW_COPY_AND_ASSIGN(acl_softmax_resource_t); - -private: - std::unique_ptr acl_obj_; -}; // acl_softmax_resource_t - -struct acl_softmax_fwd_t : public primitive_t { - struct pd_t : public cpu_softmax_fwd_pd_t { - using cpu_softmax_fwd_pd_t::cpu_softmax_fwd_pd_t; - - DECLARE_COMMON_PD_T("acl", acl_softmax_fwd_t); - - status_t init(engine_t *engine) { - - bool ok = is_fwd() - && set_default_formats() == status::success - // ACL only supports matching src/dst (this must come after - // set_default_formats() to handle format_kind::any) - && *src_md() == *dst_md() - && utils::one_of( - src_md()->data_type, data_type::f32, data_type::f16) - && attr()->has_default_values(); - if (!ok) return status::unimplemented; - - // Get memory desc to find sizes and dims - const memory_desc_wrapper src_d(src_md()); - const data_type_t data_type = src_d.data_type(); - - // ACL only supports plain tensors, can be permuted but not blocked - if (!src_d.is_plain()) return status::unimplemented; - - // Guards against a 0-sized dimension - if (src_d.has_zero_dim()) return status::unimplemented; - - // No scaling - asp_.beta = 1; - - asp_.is_logsoftmax = is_logsoftmax(); - - // The strides give us the in memory inner size - dim_t inner_size_ = src_d.blocking_desc().strides[axis()]; - - dim_t axis_size_ = axis_size(); - - // The outer size is any left-over dimensions not inner or on the axis - dim_t outer_size_ = src_d.nelems() / (inner_size_ * axis_size_); - - // In this context, NHWC tells ACL that the logical and physical - // dimensions are the same - arm_compute::DataLayout acl_layout = arm_compute::DataLayout::NHWC; - - const arm_compute::DataType acl_data_t - = acl_utils::get_acl_data_t(data_type); - - const int threads = dnnl_get_max_threads(); - if (inner_size_ == 1) { - // A rough empirical heuristic created by fitting a polynomial - // of the tensor sizes and thread count to the run time of the - // ref and ACL softmax. 
This variable is greater than zero when - // ref is faster, and less than zero when ACL is faster. We can - // interpret the constant term as the constant overhead - // associated with calling the external library and the negative - // coefficient on total_size as ACL being faster at processing - // each element - double acl_ref_performance_diff = 1 + 0.005 * outer_size_ - - 0.0027 * axis_size_ - * std::ceil(double(outer_size_) / threads); - if (threads > 1 || outer_size_ > 1) { - // Using threads within ACL adds another constant overhead - acl_ref_performance_diff += 17; - } - if (acl_ref_performance_diff > 0) return status::unimplemented; - - // If the inner size is 1, we can get rid of the dimension. - // This stops ACL doing a unnecessary permute - arm_compute::TensorShape acl_tensor_shape - = arm_compute::TensorShape(axis_size_, outer_size_); - asp_.axis = 0; - - asp_.src_info = arm_compute::TensorInfo( - acl_tensor_shape, 1, acl_data_t, acl_layout); - asp_.dst_info = arm_compute::TensorInfo( - acl_tensor_shape, 1, acl_data_t, acl_layout); - } else { - // A rough empirical heuristic, see comment above - // The only difference here is that ACL does a reorder, and so - // is considerably better - double acl_ref_performance_diff = 1 + 0.005 * outer_size_ - - 0.01 * inner_size_ * axis_size_ - * std::ceil(double(outer_size_) / threads); - if (threads > 1 || outer_size_ > 1) { - // Using threads within ACL adds another constant overhead - acl_ref_performance_diff += 17; - } - - if (acl_ref_performance_diff > 0) return status::unimplemented; - - // Irrespective of the input dimensions, we construct a tensor - // with dimensions such that softmax can be applied over the - // middle axis (1), with the correct stride and vector length. - arm_compute::TensorShape acl_tensor_shape - = arm_compute::TensorShape( - inner_size_, axis_size_, outer_size_); - asp_.axis = 1; - - asp_.src_info = arm_compute::TensorInfo( - acl_tensor_shape, 1, acl_data_t, acl_layout); - asp_.dst_info = arm_compute::TensorInfo( - acl_tensor_shape, 1, acl_data_t, acl_layout); - } - - // Validate manually to check for return status - if (asp_.is_logsoftmax) { - ACL_CHECK_VALID(arm_compute::NELogSoftmaxLayer::validate( - &asp_.src_info, &asp_.dst_info, asp_.beta, asp_.axis)); - } else { - ACL_CHECK_VALID(arm_compute::NESoftmaxLayer::validate( - &asp_.src_info, &asp_.dst_info, asp_.beta, asp_.axis)); - } - - return status::success; - } - - acl_softmax_conf_t asp_; - }; // pd_t - - acl_softmax_fwd_t(const pd_t *apd) : primitive_t(apd) {} - - status_t create_resource( - engine_t *engine, resource_mapper_t &mapper) const override { - if (mapper.has_resource(this)) return status::success; - - auto r = utils::make_unique(); - if (!r) return status::out_of_memory; - - // Configure the resource based on information from primitive descriptor - auto st = r->configure(pd()->asp_); - if (st == status::success) { mapper.add(this, std::move(r)); } - - return st; - } - - status_t execute(const exec_ctx_t &ctx) const override { - return execute_forward(ctx); - } - -private: - // To guard the const execute_forward, the mutex must be 'mutable' - mutable std::mutex mtx; - status_t execute_forward(const exec_ctx_t &ctx) const; - const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); } -}; // acl_softmax_fwd_t - -} // namespace aarch64 -} // namespace cpu -} // namespace impl -} // namespace dnnl - -#endif diff --git a/src/cpu/aarch64/acl_thread.cpp b/src/cpu/aarch64/acl_thread.cpp deleted file mode 100644 index 
1b098629ab5..00000000000 --- a/src/cpu/aarch64/acl_thread.cpp +++ /dev/null @@ -1,120 +0,0 @@ -/******************************************************************************* -* Copyright 2022-2023 Arm Ltd. and affiliates -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -#include "cpu/aarch64/acl_thread.hpp" -#if DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_THREADPOOL -#include "cpu/aarch64/acl_threadpool_scheduler.hpp" -#endif -#include "cpu/aarch64/acl_benchmark_scheduler.hpp" - -namespace dnnl { -namespace impl { -namespace cpu { -namespace aarch64 { - -namespace acl_thread_utils { - -#if DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_OMP -void acl_thread_bind() { - static std::once_flag flag_once; - // The threads in Compute Library are bound for the cores 0..max_threads-1 - // dnnl_get_max_threads() returns OMP_NUM_THREADS - const int max_threads = dnnl_get_max_threads(); - // arm_compute::Scheduler does not support concurrent access thus a - // workaround here restricts it to only one call - std::call_once(flag_once, [&]() { - arm_compute::Scheduler::get().set_num_threads(max_threads); - }); -} -// Swap BenchmarkScheduler for default ACL scheduler builds (i.e. CPPScheduler, OMPScheduler) -void acl_set_benchmark_scheduler_default() { - static std::once_flag flag_once; - arm_compute::IScheduler *_real_scheduler = &arm_compute::Scheduler::get(); - std::shared_ptr benchmark_scheduler - = std::make_unique(*_real_scheduler); - // set Benchmark scheduler in ACL - std::call_once(flag_once, [&]() { - arm_compute::Scheduler::set( - std::static_pointer_cast( - benchmark_scheduler)); - }); -} -#endif - -#if DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_THREADPOOL -void acl_set_tp_scheduler() { - static std::once_flag flag_once; - // Create threadpool scheduler - std::shared_ptr threadpool_scheduler - = std::make_unique(); - // set CUSTOM scheduler in ACL - std::call_once(flag_once, - [&]() { arm_compute::Scheduler::set(threadpool_scheduler); }); -} - -void acl_set_threadpool_num_threads() { - using namespace dnnl::impl::threadpool_utils; - static std::once_flag flag_once; - threadpool_interop::threadpool_iface *tp = get_active_threadpool(); - // Check active threadpool - bool is_main = get_active_threadpool() == tp; - if (is_main) { - // Set num threads based on threadpool size - const int num_threads = (tp) ? dnnl_get_max_threads() : 1; - std::call_once(flag_once, [&]() { - arm_compute::Scheduler::get().set_num_threads(num_threads); - }); - } -} -// Swap BenchmarkScheduler for custom scheduler builds (i.e. 
ThreadPoolScheduler) -void acl_set_tp_benchmark_scheduler() { - static std::once_flag flag_once; - // Create threadpool scheduler - std::unique_ptr threadpool_scheduler - = std::make_unique(); - arm_compute::IScheduler *_real_scheduler = nullptr; - _real_scheduler = threadpool_scheduler.release(); - // Create benchmark scheduler and set TP as real scheduler - std::shared_ptr benchmark_scheduler - = std::make_unique(*_real_scheduler); - std::call_once(flag_once, - [&]() { arm_compute::Scheduler::set(benchmark_scheduler); }); -} -#endif - -void set_acl_threading() { -#if DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_OMP - acl_thread_bind(); - if (get_verbose(verbose_t::profile_externals)) { - acl_set_benchmark_scheduler_default(); - } -#endif -#if DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_THREADPOOL - if (get_verbose(verbose_t::profile_externals)) { - acl_set_tp_benchmark_scheduler(); - } else { - acl_set_tp_scheduler(); - } - -#endif -} - -} // namespace acl_thread_utils - -} // namespace aarch64 -} // namespace cpu -} // namespace impl -} // namespace dnnl diff --git a/src/cpu/aarch64/acl_winograd_convolution.cpp b/src/cpu/aarch64/acl_winograd_convolution.cpp deleted file mode 100644 index da015388d64..00000000000 --- a/src/cpu/aarch64/acl_winograd_convolution.cpp +++ /dev/null @@ -1,139 +0,0 @@ -/******************************************************************************* -* Copyright 2020-2024 Arm Ltd. and affiliates -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -#include "acl_winograd_convolution.hpp" -#include "common/memory_tracking.hpp" -#include "common/utils.hpp" - -namespace dnnl { -namespace impl { -namespace cpu { -namespace aarch64 { - -namespace { -using data_t = prec_traits::type; - -// Keys are anonymous. So deduce the type automagically. 
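A minimal illustration of the decltype trick referred to in this comment and used by the `using conv_key_t = decltype(...)` line that follows; the names here are hypothetical:

    #include <type_traits>

    namespace names {
    enum { key_a, key_b }; // anonymous enum: its type cannot be spelled
    } // namespace names

    // decltype recovers the unnameable enum type from one of its values.
    using key_t = decltype(names::key_a);
    static_assert(std::is_same<key_t, decltype(names::key_b)>::value,
            "both enumerators share the anonymous type");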
-using conv_key_t = decltype(memory_tracking::names::key_gemm_tmp_buffer); - -// Map: [slot , key] -const std::map wino_conv_keys - = {{0, conv_key_t::key_gemm_asm_tmp_buffer}, - {1, conv_key_t::key_gemm_pretranspose_b}, - {2, conv_key_t::key_gemm_pretranspose}, - {3, conv_key_t::key_gemm_interleaved_lhs}, - {4, conv_key_t::key_gemm_pretransposed_rhs}, - {5, conv_key_t::key_gemm_transposed_1xwrhs}, - {6, conv_key_t::key_gemm_tmp_buffer}, - {7, conv_key_t::key_conv_permuted_outputs}, - {8, conv_key_t::key_conv_permuted_inputs}, - {9, conv_key_t::key_wino_workspace}, - {10, conv_key_t::key_wino_transformed_weights}, - {11, conv_key_t::key_conv_permuted_weights}}; -} // namespace - -status_t acl_wino_convolution_fwd_t::pd_t::init(engine_t *engine) { - using namespace data_type; - const bool is_fp16_ok = expect_data_types(f16, f16, f16, f16, undef) - && attr()->has_default_values( - primitive_attr_t::skip_mask_t::post_ops, f16); - const bool is_fp32_ok = expect_data_types(f32, f32, f32, f32, undef) - && attr()->has_default_values( - primitive_attr_t::skip_mask_t::post_ops, f32); - bool ok = is_fwd() - && utils::one_of(desc()->alg_kind, alg_kind::convolution_auto, - alg_kind::convolution_winograd) - && utils::one_of(true, is_fp16_ok, is_fp32_ok) - && !has_zero_dim_memory(); - - ok = ok && DNNL_CPU_THREADING_RUNTIME != DNNL_RUNTIME_THREADPOOL; - if (!ok) return status::unimplemented; - - CHECK(init_conf()); - - set_default_alg_kind(alg_kind::convolution_winograd); - - Op conv; - conv.configure(&acp_.src_tensor_info, &acp_.wei_tensor_info, - acp_.with_bias ? &acp_.bia_tensor_info : nullptr, - &acp_.dst_tensor_info, acp_.padstride_info, acp_.act_info, - true); // to support 5x5, 7x7 filter shapes in addition to 3x3 - - auto scratchpad = scratchpad_registry().registrar(); - const auto aux_mem = conv.workspace(); - return init_scratchpad(conv, scratchpad, wino_conv_keys, engine, post_ops, - attr_.post_ops_, acp_.act_info, acp_.use_dst_acc_for_sum, dst_md_); -} - -status_t acl_wino_convolution_fwd_t::init(engine_t *engine) { - auto acp = pd()->acp_; - acl_obj_->conv.configure(&acp.src_tensor_info, &acp.wei_tensor_info, - acp.with_bias ? 
&acp.bia_tensor_info : nullptr, - &acp.dst_tensor_info, acp.padstride_info, acp.act_info, - true); // to support 5x5, 7x7 filter shapes in addition to 3x3 - - acl_obj_->aux_mem_req = acl_obj_->conv.workspace(); - return status::success; -} - -status_t acl_wino_convolution_fwd_t::pd_t::init_conf() { - - // Under these conditions, fallback to faster GEMM-based convolution - // unless the user explicitly specifies Winograd algorithm - if (utils::one_of(true, src_md_.dims[2] > 112, // ih - src_md_.dims[3] > 112, // iw - src_md_.dims[1] < 64, // ic - dst_md_.dims[1]<64, // oc - dnnl_get_max_threads()> 28) - && desc()->alg_kind == alg_kind::convolution_auto) { - return status::unimplemented; - } - - // General Compute Library checks, memory tags are also set there - acp_.alg_winograd = true; - CHECK(acl_convolution_utils::acl_init_conf( - acp_, src_md_, weights_md_, dst_md_, bias_md_, *desc(), *attr())); - - const bool shape_ok - // only unit strides allowed - = (acp_.padstride_info.stride() == std::pair {1, 1}) - // Note: Compute Library supports arbitrary padding for wino kernels - // but we only allow small padding to be consistent with oneDNN - && (acp_.padstride_info.pad().first <= 1) // padding left/right - && (acp_.padstride_info.pad().second <= 1) // padding top/bottom - // only non-dilated convolutions allowed - && (acp_.dilation_info == arm_compute::Size2D(1, 1)); - - ACL_CHECK_SUPPORT(!shape_ok, "shape not supported by winograd kernels"); - - // Validate convolution manually to check for return status - ACL_CHECK_VALID(Op::validate(&acp_.src_tensor_info, &acp_.wei_tensor_info, - acp_.with_bias ? &acp_.bia_tensor_info : nullptr, - &acp_.dst_tensor_info, acp_.padstride_info, acp_.act_info, - true)); // enable_fast_math flag in ACL Winograd - - return status::success; -} - -status_t acl_wino_convolution_fwd_t::execute_forward( - const exec_ctx_t &ctx) const { - return execute_forward_conv_acl, pd_t, data_t>( - ctx, acl_obj_.get(), pd(), wino_conv_keys); -} -} // namespace aarch64 -} // namespace cpu -} // namespace impl -} // namespace dnnl diff --git a/src/cpu/aarch64/acl_winograd_convolution.hpp b/src/cpu/aarch64/acl_winograd_convolution.hpp deleted file mode 100644 index 15b015757ea..00000000000 --- a/src/cpu/aarch64/acl_winograd_convolution.hpp +++ /dev/null @@ -1,70 +0,0 @@ -/******************************************************************************* -* Copyright 2020-2024 Arm Ltd. and affiliates -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
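A scalar sketch of the shape heuristic in init_conf above, assuming a hypothetical helper name: for convolution_auto, fall back to the faster GEMM-based convolution when the shape or thread count makes Winograd unattractive.

    // Mirrors the fallback condition in
    // acl_wino_convolution_fwd_t::pd_t::init_conf above.
    bool skip_winograd_for_auto(int ih, int iw, int ic, int oc, int threads) {
        return ih > 112 || iw > 112 || ic < 64 || oc < 64 || threads > 28;
    }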
-*******************************************************************************/ - -#ifndef CPU_AARCH64_ACL_WINOGRAD_CONVOLUTION_HPP -#define CPU_AARCH64_ACL_WINOGRAD_CONVOLUTION_HPP - -#include "cpu/cpu_convolution_pd.hpp" - -#include "acl_convolution_utils.hpp" -#include "arm_compute/runtime/experimental/operators/CpuWinogradConv2d.h" - -namespace dnnl { -namespace impl { -namespace cpu { -namespace aarch64 { - -struct acl_wino_convolution_fwd_t : public primitive_t { - using Op = arm_compute::experimental::op::CpuWinogradConv2d; - - struct pd_t : public cpu_convolution_fwd_pd_t { - pd_t(const convolution_desc_t *adesc, const primitive_attr_t *attr, - const typename pd_t::base_class *hint_fwd_pd) - : cpu_convolution_fwd_pd_t(adesc, attr, hint_fwd_pd), acp_() {} - - DECLARE_COMMON_PD_T( - "wino:acl", acl_wino_convolution_fwd_t, USE_GLOBAL_SCRATCHPAD); - - status_t init(engine_t *engine); - - acl_conv_conf_t acp_; - acl_post_ops_t post_ops; - - private: - status_t init_conf(); - }; - - acl_wino_convolution_fwd_t(const pd_t *apd) - : primitive_t(apd), acl_obj_(std::make_unique>()) {} - - status_t init(engine_t *engine) override; - - status_t execute(const exec_ctx_t &ctx) const override { - return execute_forward(ctx); - } - -private: - status_t execute_forward(const exec_ctx_t &ctx) const; - const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); } - std::unique_ptr> acl_obj_; -}; // acl_wino_convolution_fwd_t - -} // namespace aarch64 -} // namespace cpu -} // namespace impl -} // namespace dnnl - -#endif // CPU_AARCH64_ACL_WINOGRAD_CONVOLUTION_HPP diff --git a/src/cpu/aarch64/brgemm/brgemm.cpp b/src/cpu/aarch64/brgemm/brgemm.cpp index 6ed6cc59597..94e1c73fd3b 100644 --- a/src/cpu/aarch64/brgemm/brgemm.cpp +++ b/src/cpu/aarch64/brgemm/brgemm.cpp @@ -1,6 +1,7 @@ /******************************************************************************* -* Copyright 2020-2023 Intel Corporation +* Copyright 2020-2025 Intel Corporation * Copyright 2023-2024 FUJITSU LIMITED +* Copyright 2024 Arm Ltd. and affiliates * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -170,8 +171,8 @@ status_t brgemm_desc_init(brgemm_t *brg, cpu_isa_t isa, if (brg == nullptr) return status::invalid_arguments; if (transA || transB) return status::unimplemented; - brgemm_utils::init_brgemm_conf(brg, isa, type, dt_a, dt_b, layout, alpha, - beta, LDA, LDB, LDC, M, N, K, strides); + CHECK(brgemm_utils::init_brgemm_conf(brg, isa, type, dt_a, dt_b, layout, + alpha, beta, LDA, LDB, LDC, M, N, K, strides)); if (M <= 0 || N <= 0 || K <= 0) return status::invalid_arguments; bool ldx_check = (brg->is_row_major()) ? 
(LDA < K)
@@ -197,8 +198,8 @@ status_t brdgmm_desc_init(brgemm_t *brg, cpu_isa_t isa,
     if (transA || layout != brgemm_row_major || alpha != 1.0f || beta != 0.f)
         return status::unimplemented;
-    brgemm_utils::init_brdgmm_conf(brg, isa, type, dt_a, dt_b, layout, alpha,
-            beta, LDA, LDC, M, N, strides);
+    CHECK(brgemm_utils::init_brdgmm_conf(brg, isa, type, dt_a, dt_b, layout,
+            alpha, beta, LDA, LDC, M, N, strides));
     const bool ldx_check = (LDA < N || LDC < N);
     if (ldx_check) return status::invalid_arguments;
@@ -290,41 +291,52 @@ status_t brgemm_desc_set_postops(brgemm_t *brg, const primitive_attr_t *attr,
     const auto &src_scales = attr->scales_.get(DNNL_ARG_SRC);
     const auto &wei_scales = attr->scales_.get(DNNL_ARG_WEIGHTS);
-    brg->with_scales = !src_scales.has_default_values()
-            || !wei_scales.has_default_values()
+    const bool has_src_scales = !src_scales.has_default_values();
+    const bool has_wei_scales = !wei_scales.has_default_values();
+    brg->with_scales = has_src_scales || has_wei_scales
             || brg->with_weights_scale_adjust;
     if (brg->with_scales) {
         // Note: the current version supports only two different output scale
         // types:
-        //     1) common (mask_ = 0)
+        //     1) common (mask = 0)
         //     2) per_n_dim_scale - broadcast across n dimension;
         //        for convolution and inner product primitives it corresponds
-        //        to "per_oc" mask_ = 1 << 1; for matmul - to
-        //        mask_ = (1 << (ndims - 1))), where ndims is number of
+        //        to "per_oc" mask = 1 << 1; for matmul - to
+        //        mask = (1 << (ndims - 1))), where ndims is number of
         //        dimensions for original matmul problem
-        // So if wei_scales.mask_ != 0 (not common) it's assumed here that scale
-        // type is per_n_dim_scale and driver which calls brgemm kernel checked
-        // that mask has correct value for this case
-        brg->is_oc_scale = wei_scales.mask_ != 0;
+        // So if wei_scales.get_mask() > 0 (not common) it's assumed here that
+        // scale type is per_n_dim_scale and driver which calls brgemm kernel
+        // checked that mask has correct value for this case
+        brg->is_oc_scale = wei_scales.get_mask() > 0;
     }
     const auto &dst_scales = attr->scales_.get(DNNL_ARG_DST);
-    brg->with_dst_scales = !dst_scales.has_default_values();
-    const bool scales_ok = src_scales.mask_ == 0 && dst_scales.mask_ == 0
+    const bool has_dst_scales = !dst_scales.has_default_values();
+    brg->with_dst_scales = has_dst_scales;
+    const bool scales_ok
+            = IMPLICATION(has_src_scales, src_scales.get_mask() == 0)
+            && IMPLICATION(has_dst_scales, dst_scales.get_mask() == 0)
             && attr->scales_.has_default_values(
                     {DNNL_ARG_SRC, DNNL_ARG_WEIGHTS, DNNL_ARG_DST});
     if (!scales_ok) return status::unimplemented;
     auto init_zp_type
             = [&](brgemm_broadcast_t &zp_type, int mem_arg) -> status_t {
-        auto zero_points = attr->zero_points_;
-
-        // common zero point type is supported for now
-        if (!zero_points.common(mem_arg)) return status::unimplemented;
+        const auto &zp = attr->zero_points_;
+        // Always init a default value.
+        zp_type = brgemm_broadcast_t::none;
+
+        if (!zp.has_default_values(mem_arg)) {
+            int mask = zp.get_mask(mem_arg);
+            if (mask == 0) {
+                zp_type = brgemm_broadcast_t::per_tensor;
+            } else if (mask == (1 << 1)) {
+                zp_type = brgemm_broadcast_t::per_n;
+            } else {
+                return status::unimplemented;
+            }
+        }
-        zp_type = zero_points.has_default_values(mem_arg)
-                ?
brgemm_broadcast_t::none - : brgemm_broadcast_t::per_tensor; return status::success; }; @@ -416,6 +428,11 @@ status_t brgemm_desc_set_attr(brgemm_t *brg, const brgemm_attr_t &brgattr) { return status::success; } +status_t brgemm_desc_finalize(brgemm_t *brg) { + // TODO: implement functionality here similar to corresponding one in x64 + return status::success; +} + status_t brgemm_kernel_create( brgemm_kernel_t **brg_kernel, const brgemm_t &brg) { if (!brg_kernel) return status::invalid_arguments; @@ -512,11 +529,13 @@ int brgemm_cmp(const brgemm_t &lhs, const brgemm_t &rhs) { CMP_BRGEMM_FIELD(brgattr.hint_prfB.dist2); CMP_BRGEMM_FIELD(brgattr.hint_prfC.dist1); CMP_BRGEMM_FIELD(brgattr.hint_prfC.dist2); - CMP_BRGEMM_FIELD(brgattr.wary_tail_read); + CMP_BRGEMM_FIELD(brgattr.wary_A_k_tail_read); + CMP_BRGEMM_FIELD(brgattr.extendable_k); CMP_BRGEMM_FIELD(brgattr.generate_skip_accumulation); CMP_BRGEMM_FIELD(brgattr.bd_mask_level); CMP_BRGEMM_FIELD(brgattr.use_uker); CMP_BRGEMM_FIELD(brgattr.use_interleave_stores); + CMP_BRGEMM_FIELD(brgattr.b_is_vnni); CMP_BRGEMM_FIELD(brgattr.fpmath_mode); CMP_BRGEMM_FIELD(brgattr.LDA2); CMP_BRGEMM_FIELD(brgattr.LDB2); diff --git a/src/cpu/aarch64/brgemm/brgemm.hpp b/src/cpu/aarch64/brgemm/brgemm.hpp index f6531f5ff64..64ae821a1c5 100644 --- a/src/cpu/aarch64/brgemm/brgemm.hpp +++ b/src/cpu/aarch64/brgemm/brgemm.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2020-2023 Intel Corporation +* Copyright 2020-2025 Intel Corporation * Copyright 2023 FUJITSU LIMITED * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -121,6 +121,11 @@ status_t DNNL_API brgemm_desc_set_postops(brgemm_t *brg, status_t DNNL_API brgemm_desc_set_attr( brgemm_t *brg, const brgemm_attr_t &brgattr); +/// Finalize BRGEMM descriptor. +/// +/// @param brg Output BRGEMM descriptor +status_t DNNL_API brgemm_desc_finalize(brgemm_t *brg); + /// Generates a BRGEMM kernel based on descriptor /// /// @param brg_kernel Output BRGEMM kernel diff --git a/src/cpu/aarch64/brgemm/brgemm_types.hpp b/src/cpu/aarch64/brgemm/brgemm_types.hpp index d6eb16cd6ff..0c5485ce8c7 100644 --- a/src/cpu/aarch64/brgemm/brgemm_types.hpp +++ b/src/cpu/aarch64/brgemm/brgemm_types.hpp @@ -133,7 +133,8 @@ struct DNNL_API brgemm_attr_t { = brgemm_kernel_prefetching_t::brgemm_prf_default; brgemm_prf_t hint_prfA, hint_prfB, hint_prfC; - bool wary_tail_read; + bool wary_A_k_tail_read {false}; + bool extendable_k {false}; bool generate_skip_accumulation; // Value of bd_mask_level specifies how bd_mask is used in brgemm kernel // 0 – bd_mask is not used @@ -147,6 +148,7 @@ struct DNNL_API brgemm_attr_t { // interleave stores or not bool use_interleave_stores; impl::fpmath_mode_t fpmath_mode = fpmath_mode::strict; + bool b_is_vnni {false}; // Second level leading dimension describing distance between 16-line // blocks in case of blocked layout. Used to calculate address of next // bd block. By default are equal to regular leading dimension parameters diff --git a/src/cpu/aarch64/brgemm/brgemm_utils.cpp b/src/cpu/aarch64/brgemm/brgemm_utils.cpp index 109436db6bf..c517d9a0856 100644 --- a/src/cpu/aarch64/brgemm/brgemm_utils.cpp +++ b/src/cpu/aarch64/brgemm/brgemm_utils.cpp @@ -1,6 +1,7 @@ /******************************************************************************* * Copyright 2022-2023 Intel Corporation * Copyright 2023-2024 FUJITSU LIMITED +* Copyright 2024 Arm Ltd. 
and affiliates * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -47,15 +48,18 @@ impl::data_type_t get_accum_datatype(brgemm_t *brg) { return brg->is_int8 ? data_type::s32 : data_type::f32; } -void init_kernel_datatype( +status_t init_kernel_datatype( brgemm_t *brg, impl::data_type_t dt_a, impl::data_type_t dt_b) { - assert(dt_a != data_type::undef && dt_b != data_type::undef); + if (!(dt_a != data_type::undef && dt_b != data_type::undef)) + return status::unimplemented; brg->is_int8 = utils::one_of(dt_a, data_type::u8, data_type::s8) && utils::one_of(dt_b, data_type::u8, data_type::s8); brg->is_bf16 = (dt_a == data_type::bf16) && (dt_b == data_type::bf16); brg->is_f32 = (dt_a == data_type::f32) && (dt_b == data_type::f32); brg->is_f16 = utils::one_of(data_type::f16, dt_a, dt_b); - assert(brg->is_int8 || brg->is_bf16 || brg->is_f32 || brg->is_f16); + if (!(brg->is_int8 || brg->is_bf16 || brg->is_f32 || brg->is_f16)) + return status::unimplemented; + return status::success; } void init_common_conf(brgemm_t *brg, brgemm_batch_kind_t type, float alpha, @@ -88,7 +92,7 @@ void maybe_try_bf32(brgemm_t *brg) { // } -void set_isa_impl(brgemm_t *brg) { +status_t set_isa_impl(brgemm_t *brg) { auto is_isa_ok = [&](cpu_isa_t isa) { return mayiuse(isa) && // maybe IMPLICATION(brg->isa_user != isa_undef, @@ -96,19 +100,14 @@ void set_isa_impl(brgemm_t *brg) { one_of(brg->isa_user, isa_undef, isa); }; - if (brg->is_bf32) { - assert(!"unsupported case"); - } else if (brg->is_f32) { - brg->isa_impl = utils::map(true, isa_undef, is_isa_ok(sve_512), sve_512, - is_isa_ok(sve_256), sve_256); - } else if (brg->is_bf16) { - assert(!"unsupported case"); - } else if (brg->is_f16) { - assert(!"unsupported case"); - } else if (brg->is_int8) { + if (brg->is_bf32 || brg->is_bf16 || brg->is_f16) { + return status::unimplemented; + } else if (brg->is_f32 || brg->is_int8) { brg->isa_impl = utils::map(true, isa_undef, is_isa_ok(sve_512), sve_512, is_isa_ok(sve_256), sve_256); + return status::success; } + return status::success; } void set_brg_vmm(brgemm_t *brg) { @@ -187,7 +186,7 @@ inline size_t data_type_vnni_granularity(data_type_t data_type) { } status_t brgemm_blocking(brgemm_t *brg) { - set_isa_impl(brg); + CHECK(set_isa_impl(brg)); if (brg->isa_impl == isa_undef) return status::unimplemented; assert(!brg->is_dgmm); // should not be called from brdgmm set_brg_vmm(brg); @@ -296,10 +295,11 @@ status_t brdgmm_blocking(brgemm_t *brg) { return status::success; } -void init_brgemm_conf(brgemm_t *brg, cpu_isa_t isa, brgemm_batch_kind_t type, - impl::data_type_t dt_a, impl::data_type_t dt_b, brgemm_layout_t layout, - float alpha, float beta, dim_t LDA, dim_t LDB, dim_t LDC, dim_t M, - dim_t N, dim_t K, const brgemm_strides_t *strides, bool is_bf32) { +status_t init_brgemm_conf(brgemm_t *brg, cpu_isa_t isa, + brgemm_batch_kind_t type, impl::data_type_t dt_a, + impl::data_type_t dt_b, brgemm_layout_t layout, float alpha, float beta, + dim_t LDA, dim_t LDB, dim_t LDC, dim_t M, dim_t N, dim_t K, + const brgemm_strides_t *strides, bool is_bf32) { init_common_conf(brg, type, alpha, beta, strides); @@ -307,7 +307,7 @@ void init_brgemm_conf(brgemm_t *brg, cpu_isa_t isa, brgemm_batch_kind_t type, brg->dt_a = brg->is_row_major() ? dt_a : dt_b; brg->dt_b = brg->is_row_major() ? 
dt_b : dt_a; - init_kernel_datatype(brg, brg->dt_a, brg->dt_b); + CHECK(init_kernel_datatype(brg, brg->dt_a, brg->dt_b)); brg->dt_c = get_accum_datatype(brg); brg->dt_d = brg->dt_c; @@ -319,7 +319,7 @@ void init_brgemm_conf(brgemm_t *brg, cpu_isa_t isa, brgemm_batch_kind_t type, brg->typesize_D = types::data_type_size(brg->dt_d); brg->isa_user = isa; - set_isa_impl(brg); + CHECK(set_isa_impl(brg)); brg->is_bf32 = false; brg->has_int8_vnni = true; @@ -352,11 +352,13 @@ void init_brgemm_conf(brgemm_t *brg, cpu_isa_t isa, brgemm_batch_kind_t type, brg->rd_step = has_no_vnni_compute_instruction ? 1 : data_type_vnni_granularity(brg->dt_b); + return status::success; } -void init_brdgmm_conf(brgemm_t *brg, cpu_isa_t isa, brgemm_batch_kind_t type, - impl::data_type_t dt_a, impl::data_type_t dt_b, brgemm_layout_t layout, - float alpha, float beta, dim_t LDA, dim_t LDC, dim_t M, dim_t N, +status_t init_brdgmm_conf(brgemm_t *brg, cpu_isa_t isa, + brgemm_batch_kind_t type, impl::data_type_t dt_a, + impl::data_type_t dt_b, brgemm_layout_t layout, float alpha, float beta, + dim_t LDA, dim_t LDC, dim_t M, dim_t N, const brgemm_strides_t *strides) { init_common_conf(brg, type, alpha, beta, strides); @@ -365,7 +367,7 @@ void init_brdgmm_conf(brgemm_t *brg, cpu_isa_t isa, brgemm_batch_kind_t type, brg->dt_a = dt_a; brg->dt_b = dt_b; - init_kernel_datatype(brg, brg->dt_a, brg->dt_b); + CHECK(init_kernel_datatype(brg, brg->dt_a, brg->dt_b)); brg->dt_c = get_accum_datatype(brg); brg->dt_d = brg->dt_c; @@ -394,6 +396,7 @@ void init_brdgmm_conf(brgemm_t *brg, cpu_isa_t isa, brgemm_batch_kind_t type, brg->bcast_dim = M; brg->load_dim = N; + return status::success; } } // namespace brgemm_utils @@ -402,4 +405,4 @@ void init_brdgmm_conf(brgemm_t *brg, cpu_isa_t isa, brgemm_batch_kind_t type, } // namespace impl } // namespace dnnl -//vim: et ts=4 sw=4 cindent cino+=l0,\:4,N-s \ No newline at end of file +//vim: et ts=4 sw=4 cindent cino+=l0,\:4,N-s diff --git a/src/cpu/aarch64/brgemm/brgemm_utils.hpp b/src/cpu/aarch64/brgemm/brgemm_utils.hpp index 485b5fde961..563a5d734ac 100644 --- a/src/cpu/aarch64/brgemm/brgemm_utils.hpp +++ b/src/cpu/aarch64/brgemm/brgemm_utils.hpp @@ -1,6 +1,7 @@ /******************************************************************************* * Copyright 2022 Intel Corporation * Copyright 2024 FUJITSU LIMITED +* Copyright 2024 Arm Ltd. and affiliates * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -44,20 +45,21 @@ status_t brdgmm_blocking(brgemm_t *brg); * having to depend on BRGeMM's API. An additional feature is that this * function can be modified depending on needs without requiring changes * at the API level. 
*/ -void init_brgemm_conf(brgemm_t *brg, cpu_isa_t isa, brgemm_batch_kind_t type, - impl::data_type_t dt_a, impl::data_type_t dt_b, brgemm_layout_t layout, - float alpha, float beta, dim_t LDA, dim_t LDB, dim_t LDC, dim_t M, - dim_t N, dim_t K, const brgemm_strides_t *strides = nullptr, - bool is_bf32 = false); +status_t init_brgemm_conf(brgemm_t *brg, cpu_isa_t isa, + brgemm_batch_kind_t type, impl::data_type_t dt_a, + impl::data_type_t dt_b, brgemm_layout_t layout, float alpha, float beta, + dim_t LDA, dim_t LDB, dim_t LDC, dim_t M, dim_t N, dim_t K, + const brgemm_strides_t *strides = nullptr, bool is_bf32 = false); /* The purpose of this function is to enable initialization of brgemm values * and then call additional functions like blocking heuristics without * having to depend on BRDGeMM's API. An additional feature is that this * function can be modified depending on needs without requiring changes * at the API level. */ -void init_brdgmm_conf(brgemm_t *brg, cpu_isa_t isa, brgemm_batch_kind_t type, - impl::data_type_t dt_a, impl::data_type_t dt_b, brgemm_layout_t layout, - float alpha, float beta, dim_t LDA, dim_t LDC, dim_t M, dim_t N, +status_t init_brdgmm_conf(brgemm_t *brg, cpu_isa_t isa, + brgemm_batch_kind_t type, impl::data_type_t dt_a, + impl::data_type_t dt_b, brgemm_layout_t layout, float alpha, float beta, + dim_t LDA, dim_t LDC, dim_t M, dim_t N, const brgemm_strides_t *strides = nullptr); } // namespace brgemm_utils diff --git a/src/cpu/aarch64/brgemm/jit_brgemm_kernel.cpp b/src/cpu/aarch64/brgemm/jit_brgemm_kernel.cpp index 087fb52935a..b3f02816761 100644 --- a/src/cpu/aarch64/brgemm/jit_brgemm_kernel.cpp +++ b/src/cpu/aarch64/brgemm/jit_brgemm_kernel.cpp @@ -1,6 +1,7 @@ /******************************************************************************* * Copyright 2021-2023 Intel Corporation * Copyright 2024 FUJITSU LIMITED +* Copyright 2024 Arm Ltd. and affiliates * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -766,10 +767,38 @@ void jit_brgemm_kernel_t::read_params() { void jit_brgemm_kernel_t::zero_accumulators(int bd_block2, bool is_bdb_tail, int ld_block2, bool is_ld_tail, bool skip_accumulation) { int bd_block = (is_bdb_tail) ? brg.bdb_tail : brg.bd_block; + const bool need_to_apply_beta = brg.beta != 0.f; for_(int bd = 0; bd < bd_block; bd++) for (int ld = 0; ld < ld_block2; ld++) { auto zmm = accm(ld_block2, bd, ld); - eor(zmm.d, zmm.d, zmm.d); + // This part is moved here from apply_alpha_beta function so that fadd instruction can be avoided. + // This is also required only when K is blocked. + if (need_to_apply_beta) { + const bool is_tail = is_ld_tail && ld + 1 == ld_block2; + const auto k_mask = is_tail ? 
ld_tail_mask : ld_full_mask; + + const int offset = C_offset(bd, ld); + + int base_offset = 0; + auto x_addr = reg_aux_C; + + if ((unsigned)(offset - base_offset) > cpu_sveLen * 7) { + add_imm(reg_tmp_, reg_aux_C, offset, X_TMP_0); + base_offset = offset; + x_addr = reg_tmp_; + } + LD_MUL_VL(ld1w, zmm.s, k_mask, x_addr, offset - base_offset, 4); + + const bool need_init_beta_vmm = brg.beta != 1.f; + auto vmm_beta = z_tail_mask(); + if (need_init_beta_vmm) { + auto wreg_tmp = WReg(reg_tmp_gpr.getIdx()); + mov_imm(wreg_tmp, float2int(static_cast(brg.beta))); + dup(vmm_beta.s, wreg_tmp); + fmul(zmm.s, zmm.s, vmm_beta.s); + } + } else + eor(zmm.d, zmm.d, zmm.d); } } @@ -790,58 +819,6 @@ void jit_brgemm_kernel_t::apply_alpha_beta( if (dq2ps_required) { scvtf(vmm.s, P_ALL_ONE / T_m, vmm.s); } if (apply_alpha) { fmul(vmm.s, vmm.s, vmm_alpha.s); } } - - if (brg.beta == 0.f) return; - const bool use_vadd_for_beta = brg.beta == 1.f && !dq2ps_required; - const bool need_init_beta_vmm = brg.beta != 1.f; - auto vmm_prev_dst = z_tmp_1(); - auto vmm_beta = z_tail_mask(); - if (need_init_beta_vmm) { - auto wreg_tmp = WReg(reg_tmp_gpr.getIdx()); - mov_imm(wreg_tmp, float2int(static_cast(brg.beta))); - dup(vmm_beta.s, wreg_tmp); - } - - int base_offset = 0; - auto x_addr = reg_aux_C; - for_(int bd = 0; bd < bd_block; bd++) - for (int ld = 0; ld < ld_block2; ld++) { - const bool is_tail = is_ld_tail && ld + 1 == ld_block2; - const auto k_mask = is_tail ? ld_tail_mask : ld_full_mask; - auto vmm = accm(ld_block2, bd, ld); - if (use_vadd_for_beta) { - if (brg.is_int8) { - assert(!"unsupported\n"); - } else { - ZRegS z_masked = vmm.s; - ZRegS z(vmm.getIdx()); - - const int offset = C_offset(bd, ld); - - if ((unsigned)(offset - base_offset) > cpu_sveLen * 7) { - add_imm(reg_tmp_, reg_aux_C, offset, X_TMP_0); - base_offset = offset; - x_addr = reg_tmp_; - } - LD_MUL_VL(ld1w, vmm_prev_dst.s, k_mask, x_addr, - offset - base_offset, 4); - if (is_ld_tail) { - movprfx(z_masked, k_mask / T_z, z); - fadd(z_masked, k_mask / T_m, vmm_prev_dst.s); - } else { - fadd(z_masked, z_masked, vmm_prev_dst.s); - } - } - } else { - add_imm(X_DEFAULT_ADDR, reg_aux_C, C_offset(bd, ld), X_TMP_0); - ld1w(vmm_prev_dst.s, k_mask / T_z, ptr(X_DEFAULT_ADDR)); - if (brg.beta == 1.f) { - fadd(vmm.s, vmm.s, vmm_prev_dst.s); - } else { - fmla(vmm.s, P_ALL_ONE / T_m, vmm_prev_dst.s, vmm_beta.s); - } - } - } } void jit_brgemm_kernel_t::apply_post_ops( @@ -1414,7 +1391,8 @@ void jit_brgemm_kernel_t::gemm_microkernel_sve512(int bd_block2, || brg.zp_type_a != brgemm_broadcast_t::none); if (brg.req_cal_comp_pads || comp_vpad) assert(!"unsupported\n"); - bool maybe_load_bytes = (rows_for_rd_tail > 0 || brg.brgattr.wary_tail_read) + bool maybe_load_bytes + = (rows_for_rd_tail > 0 || brg.brgattr.wary_A_k_tail_read) && is_rd_tail && rd_tail_size != 0 && (brg.is_bf16 || brg.is_int8); if (n_bcast_1_load) { for (int rd = 0; rd < rd_loop; rd += brg.rd_step) { @@ -1424,7 +1402,7 @@ void jit_brgemm_kernel_t::gemm_microkernel_sve512(int bd_block2, auto rows_by_load_bytes = have_to_load_bytes ? 
rows_for_rd_tail : 0;
         for (int bd = bd_b; bd < bd_e && !is_emdbd; bd++) {
             const auto bd_by_load_bytes = (bd >= bd_e - rows_by_load_bytes
-                    || brg.brgattr.wary_tail_read);
+                    || brg.brgattr.wary_A_k_tail_read);
             broadcast(bcst(bd), A_offset(bd, rd),
                     have_to_load_bytes && bd_by_load_bytes, brg.dt_a);
         }
@@ -1464,7 +1442,6 @@ void jit_brgemm_kernel_t::gemm_microkernel_sve512(int bd_block2,
     int base_offset = 0;
     for (int rd = 0; rd < rd_loop; rd += brg.rd_step) {
-        int prefetch_count_B = 0;
         for (int ld = 0; ld < ld_block2; ld++) {
             const auto mask = is_ld_tail ? ld_tail_mask : P_ALL_ONE;
             if (brg.dt_b == data_type::f16) {
@@ -1492,17 +1469,11 @@ void jit_brgemm_kernel_t::gemm_microkernel_sve512(int bd_block2,
             if (!is_emdbd) {
                 const auto bd_by_load_bytes = (bd >= bd_e - rows_by_load_bytes
-                        || brg.brgattr.wary_tail_read);
+                        || brg.brgattr.wary_A_k_tail_read);
                 broadcast(bcst(), A_offset(bd, rd),
                         have_to_load_bytes && bd_by_load_bytes, brg.dt_a);
             }
-            if (prefetch_count_B < ld_block2) {
-                add_imm(X_DEFAULT_ADDR, reg_aux_B,
-                        B_offset(prefetch_count_B++, rd)
-                                + brg.LDB * brg.rd_block * brg.typesize_B,
-                        X_TMP_0);
-                prfm(PLDL1KEEP, ptr(X_DEFAULT_ADDR));
-            }
+            // The current implementation of prefetch is not giving any gain
+            // in performance but is rather introducing some latency.
+            // Therefore it is removed until a new useful implementation is
+            // devised.
             for (int ld = 0; ld < ld_block2; ld++) {
                 auto zmm = accm(ld_block2, bd, ld);
                 if (is_emdbd) {
@@ -1876,7 +1847,7 @@ void jit_brgemm_kernel_t::bdb_loop() {
 }
 void jit_brgemm_kernel_t::generate() {
-    size_t simd_w_;
+    size_t simd_w_ = 0;
     switch (brg.isa_impl) {
         case sve_512:
             simd_w_ = cpu_isa_traits<sve_512>::vlen / sizeof(float);
             break;
         case sve_256:
             simd_w_ = cpu_isa_traits<sve_256>::vlen / sizeof(float);
             break;
-        default: assert(!"unsupported isa");
+        default: {
+            assert(!"unsupported isa");
+            return;
+        }
     }
     preamble();
     if (simd_w_ != cpu_sveLen / sizeof(float)) {
@@ -1935,7 +1909,8 @@ brgemm_attr_t::brgemm_attr_t()
     , hint_innermost_loop(brgemm_ld_loop_innermost)
     , hint_loop_order(brgemm_kernel_loop_order_t::brgemm_lo_default)
     , hint_prefetching(brgemm_kernel_prefetching_t::brgemm_prf_default)
-    , wary_tail_read(true)
+    , wary_A_k_tail_read(true)
+    , extendable_k(false)
     , generate_skip_accumulation(false)
     , bd_mask_level(0)
     , use_uker(false)
diff --git a/src/cpu/aarch64/cpu_isa_traits.hpp b/src/cpu/aarch64/cpu_isa_traits.hpp
index 64a6368b654..8bf338cf08b 100644
--- a/src/cpu/aarch64/cpu_isa_traits.hpp
+++ b/src/cpu/aarch64/cpu_isa_traits.hpp
@@ -31,8 +31,8 @@
 #define XBYAK_USE_MMAP_ALLOCATOR
 #endif
-#include "cpu/aarch64/xbyak_aarch64/xbyak_aarch64/xbyak_aarch64.h"
-#include "cpu/aarch64/xbyak_aarch64/xbyak_aarch64/xbyak_aarch64_util.h"
+#include "xbyak_aarch64/xbyak_aarch64/xbyak_aarch64.h"
+#include "xbyak_aarch64/xbyak_aarch64/xbyak_aarch64_util.h"
 namespace dnnl {
 namespace impl {
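The zero_accumulators change above folds the beta scaling of C into accumulator initialization, so apply_alpha_beta no longer needs a separate load-and-fadd pass over C. A minimal scalar sketch of the idea, with a hypothetical function name:

    // When beta == 0 the accumulator is simply zeroed; otherwise it starts
    // as beta * C, with the multiply skipped when beta == 1.
    void init_accumulators(float *acc, const float *C, int n, float beta) {
        for (int i = 0; i < n; ++i) {
            if (beta == 0.f)
                acc[i] = 0.f;
            else
                acc[i] = (beta == 1.f) ? C[i] : beta * C[i];
        }
    }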
diff --git a/src/cpu/aarch64/cpu_reducer.cpp b/src/cpu/aarch64/cpu_reducer.cpp
index 1e1c947dc96..4361e3c0c21 100644
--- a/src/cpu/aarch64/cpu_reducer.cpp
+++ b/src/cpu/aarch64/cpu_reducer.cpp
@@ -99,7 +99,7 @@ using namespace Xbyak_aarch64;
 template <impl::data_type_t data_type>
 struct reducer_2d_driver_t : public jit_generator {
-    using data_t = typename prec_traits<data_type>::type;
+    using data_t = typename prec_traits_t<data_type>::type;
     reducer_2d_driver_t(int n_src, size_t src_ld, size_t src_step,
             size_t dst_step, bool nullify_dst)
@@ -122,7 +122,7 @@ template <impl::data_type_t data_type, cpu_isa_t isa>
 struct reducer_2d_driver_f_s_32_t : public reducer_2d_driver_t<data_type> {
     DECLARE_CPU_JIT_AUX_FUNCTIONS(reducer_2d_driver_f_s_32_t)
-    using data_t = typename prec_traits<data_type>::type;
+    using data_t = typename prec_traits_t<data_type>::type;
     void operator()(
             data_t *dst, const data_t *srcs, size_t ny, size_t nx) override {
@@ -134,7 +134,7 @@ struct reducer_2d_driver_f_s_32_t : public reducer_2d_driver_t<data_type> {
     const int vlen = cpu_isa_traits<isa>::vlen;
     const int typesize
-            = sizeof(typename dnnl::impl::prec_traits<data_type>::type);
+            = sizeof(typename dnnl::impl::prec_traits_t<data_type>::type);
     XReg reg_dst = abi_param1;
     XReg reg_src = abi_param2;
     XReg reg_ny = abi_param3;
diff --git a/src/cpu/aarch64/cpu_reducer.hpp b/src/cpu/aarch64/cpu_reducer.hpp
index 0ccbd446948..7e6566c32cc 100644
--- a/src/cpu/aarch64/cpu_reducer.hpp
+++ b/src/cpu/aarch64/cpu_reducer.hpp
@@ -169,7 +169,7 @@ struct reducer_2d_driver_t;
  */
 template <impl::data_type_t data_type>
 struct cpu_reducer_t {
-    typedef typename prec_traits<data_type>::type data_t;
+    typedef typename prec_traits_t<data_type>::type data_t;
     struct conf_t {
         conf_t() = default;
@@ -249,7 +249,7 @@ struct cpu_reducer_t {
 template <impl::data_type_t data_type>
 struct cpu_reducer_2d_t {
-    typedef typename prec_traits<data_type>::type data_t;
+    typedef typename prec_traits_t<data_type>::type data_t;
     struct conf_t {
         conf_t() = default;
@@ -334,7 +334,7 @@ struct cpu_reducer_2d_t {
 /** simple 1d accumulator: y[:] += x[:] */
 template <impl::data_type_t data_type>
 struct cpu_accumulator_1d_t {
-    typedef typename prec_traits<data_type>::type data_t;
+    typedef typename prec_traits_t<data_type>::type data_t;
     cpu_accumulator_1d_t();
     ~cpu_accumulator_1d_t();
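The cpu_reducer changes above are a mechanical rename of prec_traits to prec_traits_t. A simplified sketch of what such a trait does, assuming the usual tag-to-storage-type mapping; this is not the library's actual definition:

    #include <cstdint>

    enum class data_type { f32, s32 };

    template <data_type dt> struct prec_traits_t; // primary template
    template <> struct prec_traits_t<data_type::f32> { using type = float; };
    template <> struct prec_traits_t<data_type::s32> { using type = int32_t; };

    // e.g. a reducer instantiated for f32 stores its data as float:
    using data_t = typename prec_traits_t<data_type::f32>::type;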
diff --git a/src/cpu/aarch64/injectors/jit_uni_eltwise_injector.cpp b/src/cpu/aarch64/injectors/jit_uni_eltwise_injector.cpp
index 435f12b16f1..00163cbecaa 100644
--- a/src/cpu/aarch64/injectors/jit_uni_eltwise_injector.cpp
+++ b/src/cpu/aarch64/injectors/jit_uni_eltwise_injector.cpp
@@ -1,6 +1,6 @@
 /*******************************************************************************
 * Copyright 2019-2023 Intel Corporation
-* Copyright 2021-2023 FUJITSU LIMITED
+* Copyright 2021-2024 FUJITSU LIMITED
 * Copyright 2022 Arm Ltd. and affiliates
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
@@ -475,9 +475,87 @@ void jit_uni_eltwise_injector_f32<isa>::elu_compute_vector_fwd(
     h->mov(vmm_src, p_mask / T_m, vmm_aux3);
 }
+template <cpu_isa_t isa>
+void jit_uni_eltwise_injector_f32<
+        isa>::tanh_polynomial_approx_compute_vector_fwd(const TRegS &vmm_src) {
+
+    if (!utils::one_of(isa, sve_512)) return;
+
+    using namespace Xbyak_aarch64::util;
+
+    const int tanh_n_polynomials = 32;
+
+    // Register mapping
+    TRegS vmm_dst = vmm_aux1, vmm_src_shift = vmm_aux1, vmm_coeff = vmm_aux1,
+          vmm_pol = vmm_aux2, vmm_indices = vmm_aux3, vmm_tmp = vmm_aux3,
+          vmm_src_pos = vmm_aux4, vmm_sign = vmm_aux4;
+
+    const auto &mask = PReg(6); // avoid pred regs used in *conv_kernel*
+
+    // Helper function to gather polynomial coefficients
+    auto gather_coefficient = [&](TRegS vmm_coeff, int coeff_idx,
+                                      TRegS vmm_pol_idx) {
+        h->add_imm(h->X_TMP_1, x_table,
+                table_off(tanh_pol_table, coeff_idx * tanh_n_polynomials),
+                h->X_TMP_0);
+        h->ld1w(ZRegS(IDX(vmm_coeff)), p_all,
+                ptr(h->X_TMP_1, ZRegS(IDX(vmm_pol_idx)), SXTW));
+    };
+
+    // because tanh(x) = -tanh(-x), we extract sign to make x positive
+    // and reapply sign at the end
+    h->fabs(vmm_src_pos, p_all / T_z, vmm_src);
+
+    // Compute indices for the table lookup
+    h->sub(ZRegS(IDX(vmm_indices)), ZRegS(IDX(vmm_src_pos)),
+            ZRegS(IDX(table_val(tanh_idx_bias, z_tmp))));
+    h->and_(ZRegD(IDX(vmm_indices)), ZRegD(IDX(vmm_indices)),
+            ZRegD(IDX(table_val(tanh_idx_mask, z_tmp))));
+    h->lsr(ZRegD(IDX(vmm_indices)), ZRegD(IDX(vmm_indices)), 20);
+
+    // Argument reduction
+    h->and_(ZRegD(IDX(vmm_src_shift)), ZRegD(IDX(vmm_src_pos)),
+            ZRegD(IDX(table_val(tanh_idx_mask, z_tmp))));
+    h->fsub(vmm_src_pos, vmm_src_pos, ZRegS(IDX(vmm_src_shift)));
+
+    gather_coefficient(vmm_pol, 6, vmm_indices);
+    for (int deg = 5; deg >= 0; --deg) {
+        gather_coefficient(vmm_coeff, deg, vmm_indices);
+        h->fmad(vmm_pol, p_all / T_m, vmm_src_pos, vmm_coeff);
+    }
+
+    // Restore src_pos
+    h->fabs(vmm_src_pos, p_all / T_z, vmm_src);
+
+    // Now blend the results
+    // [saturation_ubound; +inf] : return +/- 1
+    table_val(one, vmm_dst);
+
+    // [linear_ubound; saturation_lbound] : return +/- P(x)
+    table_val(tanh_saturation_lbound, vmm_tmp);
+    h->fcmgt(PRegS(IDX(mask)), p_all / T_z, vmm_tmp, vmm_src_pos);
+    h->sel(vmm_dst, mask / T_m, vmm_pol, vmm_dst);
+
+    // [0; linear_ubound] : return x
+    table_val(tanh_linear_ubound, vmm_tmp);
+    h->fcmgt(PRegS(IDX(mask)), p_all / T_z, vmm_tmp, vmm_src_pos);
+    h->sel(vmm_dst, mask / T_m, vmm_src_pos, vmm_dst);
+
+    // Reapply sign and return
+    h->and_(ZRegD(IDX(vmm_sign)), ZRegD(IDX(vmm_src)),
+            ZRegD(IDX(table_val(sign_mask, z_tmp))));
+    h->eor(ZRegD(IDX(vmm_src)), ZRegD(IDX(vmm_dst)), ZRegD(IDX(vmm_sign)));
+}
 template <cpu_isa_t isa>
 void jit_uni_eltwise_injector_f32<isa>::tanh_compute_vector_fwd(
         const TRegS &vmm_src) {
+
+    if (utils::one_of(isa, sve_512)) {
+        tanh_polynomial_approx_compute_vector_fwd(vmm_src);
+        return;
+    }
+
     // tanh(x) = x(1 + (-1/3)x^2) for |x| < tanh_range
     // tanh(x) = 1 - 2/(1 + exp(2 x)) for otherwise
@@ -918,10 +996,87 @@ void jit_uni_eltwise_injector_f32<isa>::log_compute_vector_fwd(
     }
     h->L(exitL);
 }
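A scalar sketch of the table-driven tanh approximation above, assuming the same layout (32 polynomials of degree 6, indexed by the leading bits of |x|); the names are hypothetical, the coefficient table is elided, and the saturation/linear blending is only noted in a comment:

    #include <cmath>
    #include <cstdint>
    #include <cstring>

    // Hypothetical coefficient table: 7 coefficients (degree 0..6) for each
    // of the 32 polynomials, mirroring tanh_pol_table's layout.
    extern const float tanh_pol[7][32];

    float tanh_poly_approx(float x) {
        float ax = std::fabs(x); // tanh(-x) = -tanh(x), so work on |x|
        uint32_t bits;
        std::memcpy(&bits, &ax, sizeof(bits));
        // Interval index from the leading bits of |x| (the kernel keeps a
        // byte offset instead, i.e. shifts by 20 rather than 22).
        uint32_t idx = ((bits - 0x39800000u) & 0xffc00000u) >> 22; // 0..31
        // Argument reduction: subtract the interval's left endpoint.
        uint32_t lo_bits = bits & 0xffc00000u;
        float lo;
        std::memcpy(&lo, &lo_bits, sizeof(lo));
        float r = ax - lo;
        // Degree-6 polynomial in Horner form, matching the fmad loop above.
        float p = tanh_pol[6][idx];
        for (int deg = 5; deg >= 0; --deg)
            p = p * r + tanh_pol[deg][idx];
        // (The kernel additionally blends: |x| above the saturation bound
        // returns 1, and |x| below the linear bound returns x itself.)
        return std::copysign(p, x);
    }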
+template <cpu_isa_t isa>
+void jit_uni_eltwise_injector_f32<
+        isa>::gelu_erf_minimax_approx_compute_vector_fwd(const TRegS &vmm_src) {
+    if (isa != sve_512) { // TODO: change this condition based on cpu id.
+        return;
+    }
+
+    // register mapping
+    TRegS vmm_pol = vmm_aux0;
+    TRegS vmm_src_pos = vmm_aux1;
+    TRegS vmm_indices = vmm_aux2;
+    TRegS vmm_tmp = vmm_aux3; // this is for immediate read after write
+
+    auto gather_coefficient
+            = [&](TRegS vmm_coeff, int coeff_idx, TRegS vmm_pol_idx) {
+                  // we actually have 25 polynomials but pad to avoid
+                  // unaligned accesses.
+                  int gelu_erf_n_polynomials = 32;
+                  h->add_imm(h->X_TMP_1, x_table,
+                          table_off(gelu_erf_minimax_pol,
+                                  coeff_idx * gelu_erf_n_polynomials),
+                          h->X_TMP_0);
+                  h->ld1w(ZRegS(IDX(vmm_coeff)), p_all / T_z,
+                          ptr(h->X_TMP_1, ZRegS(IDX(vmm_pol_idx)), SXTW));
+              };
+
+    // we use the erf function symmetry erf(-x) = -erf(x)
+    // So we make x positive, we will reapply the sign after erf evaluation
+    h->fabs(vmm_src_pos, p_all / T_z, vmm_src);
+
+    // Compute indices for table lookup
+    h->add(vmm_indices, vmm_src_pos,
+            ZRegS(IDX(table_val(gelu_erf_idx_bias, z_tmp, 0))));
+
+    // An arithmetic shift is needed to properly map denormals to
+    // their polynomial. we shift by 21 as we use 2 bits of mantissa
+    // for indexing.
+    h->asr(ZRegS(IDX(vmm_indices)), ZRegS(IDX(vmm_indices)), 21);
+
+    // Apply special rules
+    h->smax(vmm_indices, p_all / T_z,
+            ZRegS(IDX(table_val(gelu_erf_one, z_tmp))));
+    h->smin(vmm_indices, p_all / T_z,
+            ZRegS(IDX(table_val(gelu_erf_twenty_four, z_tmp))));
+
+    // We have to check
+    //     index = x_pos > rbound ? 23 : index;
+    // for erf to return -1/1 when we should.
+    h->fcmlt(p_mask.s, p_all / T_z, vmm_src_pos,
+            ZRegS(IDX(table_val(gelu_erf_rbound, z_tmp))));
+    h->sel(vmm_indices, p_mask, vmm_indices,
+            ZRegS(IDX(table_val(gelu_erf_twenty_three, z_tmp))));
+
+    // Adjusting indices
+    h->mul(ZRegS(IDX(vmm_indices)), sizeof(float));
+
+    // Evaluate the polynomial
+    gather_coefficient(vmm_pol, 5, vmm_indices);
+    for (int deg = 4; deg >= 0; --deg) {
+        gather_coefficient(vmm_tmp, deg, vmm_indices);
+        h->fmad(vmm_pol, p_all / T_z, vmm_src_pos, vmm_tmp);
+    }
+
+    // Set the sign of vmm_pol properly
+    h->mov(ZRegD(IDX(vmm_tmp)), ZRegD(IDX(vmm_src)));
+    h->and_(ZRegD(IDX(vmm_tmp)), ZRegD(IDX(vmm_tmp)),
+            ZRegD(IDX(table_val(sign_mask, z_tmp))));
+    h->eor(ZRegD(IDX(vmm_pol)), p_all / T_z, ZRegD(IDX(vmm_tmp)));
+
+    // Compute the final output
+    h->fadd(vmm_pol, vmm_pol, ZRegS(IDX(table_val(one, z_tmp))));
+    h->fmul(vmm_src, p_all / T_z, vmm_pol);
+    h->fmul(vmm_src, vmm_src, ZRegS(IDX(table_val(half, z_tmp))));
+}
 template <cpu_isa_t isa>
 void jit_uni_eltwise_injector_f32<isa>::gelu_erf_compute_vector_fwd(
         const TRegS &vmm_src) {
+
+    if (isa == sve_512) { // TODO: consider performance improvement for lower ISA
+        gelu_erf_minimax_approx_compute_vector_fwd(vmm_src);
+        return;
+    }
     // Here we approximate erf(x) using the expression by
     // Abramowitz and Stegun from ``Handbook of Mathematical
     // Functions''
@@ -1657,9 +1812,248 @@ void jit_uni_eltwise_injector_f32<isa>::register_table_entries() {
             {bwd_mish_max_x_for_equation_f, {0x41b17217, true}}};
     // tanh(x) constants for four interval approximation
-    static const table_t tanh_consts {
-            {tanh_range, {0x3d4ccccd, true}},
+    // and for polynomial approximation
+    static const table_t tanh_consts {{tanh_range, {0x3d4ccccd, true}},
             {tanh_m1d3, {0xbeaaaaab, true}},
+            {tanh_idx_bias, {0x39800000, true}},
+            {tanh_idx_mask, {0xffc00000, true}},
+            {tanh_linear_ubound, {0x39ddb3d7, true}},
+            {tanh_saturation_lbound, {0x41102cb3, true}}};
+
+    // tanh(x) polynomial approximation
+    // For each coefficient, there are 32 entries
+    static const table_t tanh_polynomial_table {
+            // coefficients of degree 0
+            {tanh_pol_table,
{0x00000000, false}}, + {tanh_pol_table, {0x39bfffff, false}}, + {tanh_pol_table, {0x39ffffff, false}}, + {tanh_pol_table, {0x3a3ffffe, false}}, + {tanh_pol_table, {0x3a7ffffb, false}}, + {tanh_pol_table, {0x3abffff7, false}}, + {tanh_pol_table, {0x3affffeb, false}}, + {tanh_pol_table, {0x3b3fffdc, false}}, + {tanh_pol_table, {0x3b7fffab, false}}, + {tanh_pol_table, {0x3bbfff70, false}}, + {tanh_pol_table, {0x3bfffeab, false}}, + {tanh_pol_table, {0x3c3ffdc0, false}}, + {tanh_pol_table, {0x3c7ffaab, false}}, + {tanh_pol_table, {0x3cbff701, false}}, + {tanh_pol_table, {0x3cffeaad, false}}, + {tanh_pol_table, {0x3d3fdc08, false}}, + {tanh_pol_table, {0x3d7faacd, false}}, + {tanh_pol_table, {0x3dbf7081, false}}, + {tanh_pol_table, {0x3dfeacc9, false}}, + {tanh_pol_table, {0x3e3dc7fd, false}}, + {tanh_pol_table, {0x3e7acbf5, false}}, + {tanh_pol_table, {0x3eb77a9f, false}}, + {tanh_pol_table, {0x3eec9a9f, false}}, + {tanh_pol_table, {0x3f22991f, false}}, + {tanh_pol_table, {0x3f42f7d6, false}}, + {tanh_pol_table, {0x3f67b7cc, false}}, + {tanh_pol_table, {0x3f76ca83, false}}, + {tanh_pol_table, {0x3f7ebbe9, false}}, + {tanh_pol_table, {0x3f7fd40c, false}}, + {tanh_pol_table, {0x3f7fff32, false}}, + {tanh_pol_table, {0x3f7ffffc, false}}, + {tanh_pol_table, {0x3f800000, false}}, + // coefficients of degree 1 + {tanh_pol_table, {0x3f800000, false}}, + {tanh_pol_table, {0x3f800018, false}}, + {tanh_pol_table, {0x3f7fffe8, false}}, + {tanh_pol_table, {0x3f7fffda, false}}, + {tanh_pol_table, {0x3f7fffdc, false}}, + {tanh_pol_table, {0x3f7fffdc, false}}, + {tanh_pol_table, {0x3f7fffac, false}}, + {tanh_pol_table, {0x3f7fff70, false}}, + {tanh_pol_table, {0x3f7ffeec, false}}, + {tanh_pol_table, {0x3f7ffdc0, false}}, + {tanh_pol_table, {0x3f7ffbed, false}}, + {tanh_pol_table, {0x3f7ff704, false}}, + {tanh_pol_table, {0x3f7feff5, false}}, + {tanh_pol_table, {0x3f7fdbca, false}}, + {tanh_pol_table, {0x3f7fbfff, false}}, + {tanh_pol_table, {0x3f7f7041, false}}, + {tanh_pol_table, {0x3f7f009b, false}}, + {tanh_pol_table, {0x3f7dc36c, false}}, + {tanh_pol_table, {0x3f7c0aa8, false}}, + {tanh_pol_table, {0x3f7734b8, false}}, + {tanh_pol_table, {0x3f70a4de, false}}, + {tanh_pol_table, {0x3f5f1fd8, false}}, + {tanh_pol_table, {0x3f495493, false}}, + {tanh_pol_table, {0x3f18b9ec, false}}, + {tanh_pol_table, {0x3ed706cb, false}}, + {tanh_pol_table, {0x3e390b06, false}}, + {tanh_pol_table, {0x3d90b11f, false}}, + {tanh_pol_table, {0x3c21a053, false}}, + {tanh_pol_table, {0x3aaf7fdb, false}}, + {tanh_pol_table, {0x37ccc1a3, false}}, + {tanh_pol_table, {0x355c6733, false}}, + {tanh_pol_table, {0x00000000, false}}, + // coefficients of degree 2 + {tanh_pol_table, {0x00000000, false}}, + {tanh_pol_table, {0xbe4e0ff1, false}}, + {tanh_pol_table, {0x3d25b1b1, false}}, + {tanh_pol_table, {0x3d6b6dab, false}}, + {tanh_pol_table, {0x3c9fb1d5, false}}, + {tanh_pol_table, {0xbabff06f, false}}, + {tanh_pol_table, {0x3c07b3f6, false}}, + {tanh_pol_table, {0xbb3fc1bc, false}}, + {tanh_pol_table, {0x3a9f5921, false}}, + {tanh_pol_table, {0xbbbf06f2, false}}, + {tanh_pol_table, {0xbbb0f402, false}}, + {tanh_pol_table, {0xbc47db9e, false}}, + {tanh_pol_table, {0xbc73d5e7, false}}, + {tanh_pol_table, {0xbca25bda, false}}, + {tanh_pol_table, {0xbcfca780, false}}, + {tanh_pol_table, {0xbd40e07c, false}}, + {tanh_pol_table, {0xbd7dab03, false}}, + {tanh_pol_table, {0xbdbe4a0f, false}}, + {tanh_pol_table, {0xbdfb14a5, false}}, + {tanh_pol_table, {0xbe36cc8d, false}}, + {tanh_pol_table, {0xbe6bd102, false}}, + {tanh_pol_table, 
{0xbe9fe7c5, false}}, + {tanh_pol_table, {0xbeba0f10, false}}, + {tanh_pol_table, {0xbec206a8, false}}, + {tanh_pol_table, {0xbea3c388, false}}, + {tanh_pol_table, {0xbe277d62, false}}, + {tanh_pol_table, {0xbd8b7960, false}}, + {tanh_pol_table, {0xbc209f49, false}}, + {tanh_pol_table, {0xbaad44ca, false}}, + {tanh_pol_table, {0xb7c6eeac, false}}, + {tanh_pol_table, {0xb663aa41, false}}, + {tanh_pol_table, {0x00000000, false}}, + // coefficients of degree 3 + {tanh_pol_table, {0x00000000, false}}, + {tanh_pol_table, {0x45b3ae96, false}}, + {tanh_pol_table, {0xc414eb20, false}}, + {tanh_pol_table, {0xc450e02e, false}}, + {tanh_pol_table, {0xc3152b4e, false}}, + {tanh_pol_table, {0xbead2f56, false}}, + {tanh_pol_table, {0xc2162e02, false}}, + {tanh_pol_table, {0xbeb4bd5a, false}}, + {tanh_pol_table, {0xc11a59a4, false}}, + {tanh_pol_table, {0xbed2f507, false}}, + {tanh_pol_table, {0xc020d32c, false}}, + {tanh_pol_table, {0x3dd0f506, false}}, + {tanh_pol_table, {0xbf2a75e2, false}}, + {tanh_pol_table, {0xbff950e3, false}}, + {tanh_pol_table, {0xbed47334, false}}, + {tanh_pol_table, {0xbe809b8c, false}}, + {tanh_pol_table, {0xbeb64532, false}}, + {tanh_pol_table, {0xbe961a5b, false}}, + {tanh_pol_table, {0xbe9b63ac, false}}, + {tanh_pol_table, {0xbea0d4b2, false}}, + {tanh_pol_table, {0xbe828a77, false}}, + {tanh_pol_table, {0xbe378612, false}}, + {tanh_pol_table, {0xbdc20908, false}}, + {tanh_pol_table, {0x3d2d3957, false}}, + {tanh_pol_table, {0x3dd46e89, false}}, + {tanh_pol_table, {0x3db3f629, false}}, + {tanh_pol_table, {0x3d2c5e7b, false}}, + {tanh_pol_table, {0x3bd20403, false}}, + {tanh_pol_table, {0x3a59dfae, false}}, + {tanh_pol_table, {0x3770af45, false}}, + {tanh_pol_table, {0x372cc014, false}}, + {tanh_pol_table, {0x00000000, false}}, + // coefficients of degree 4 + {tanh_pol_table, {0x00000000, false}}, + {tanh_pol_table, {0xcc981a1b, false}}, + {tanh_pol_table, {0x4a7edd3d, false}}, + {tanh_pol_table, {0x4ab1007c, false}}, + {tanh_pol_table, {0x48fedd9c, false}}, + {tanh_pol_table, {0x41a557b5, false}}, + {tanh_pol_table, {0x477ee32a, false}}, + {tanh_pol_table, {0x422557f5, false}}, + {tanh_pol_table, {0x45ff3ce4, false}}, + {tanh_pol_table, {0x42a55641, false}}, + {tanh_pol_table, {0x446e0867, false}}, + {tanh_pol_table, {0xc33dc19a, false}}, + {tanh_pol_table, {0x42915214, false}}, + {tanh_pol_table, {0x43af4fad, false}}, + {tanh_pol_table, {0x4110fe88, false}}, + {tanh_pol_table, {0xc1099b75, false}}, + {tanh_pol_table, {0x3fc8a8dc, false}}, + {tanh_pol_table, {0xbfbeaef5, false}}, + {tanh_pol_table, {0xbe365aad, false}}, + {tanh_pol_table, {0x3f4d9652, false}}, + {tanh_pol_table, {0x3ddfa08f, false}}, + {tanh_pol_table, {0x3e34e9b8, false}}, + {tanh_pol_table, {0x3e2d07a6, false}}, + {tanh_pol_table, {0x3dc63567, false}}, + {tanh_pol_table, {0x3cdaeb78, false}}, + {tanh_pol_table, {0xbcd17537, false}}, + {tanh_pol_table, {0xbc92829c, false}}, + {tanh_pol_table, {0xbb43ab99, false}}, + {tanh_pol_table, {0xb9b471dd, false}}, + {tanh_pol_table, {0xb6baad5a, false}}, + {tanh_pol_table, {0xb78bafc7, false}}, + {tanh_pol_table, {0x00000000, false}}, + // coefficients of degree 5 + {tanh_pol_table, {0x00000000, false}}, + {tanh_pol_table, {0x52f688d5, false}}, + {tanh_pol_table, {0xd0505c72, false}}, + {tanh_pol_table, {0xd08f98e3, false}}, + {tanh_pol_table, {0xce505cc9, false}}, + {tanh_pol_table, {0xc7162b8a, false}}, + {tanh_pol_table, {0xcc5061d6, false}}, + {tanh_pol_table, {0xc7162bdf, false}}, + {tanh_pol_table, {0xca50b37f, false}}, + {tanh_pol_table, {0xc7162a3a, 
false}}, + {tanh_pol_table, {0xc8422086, false}}, + {tanh_pol_table, {0x471a714e, false}}, + {tanh_pol_table, {0xc5ece1f1, false}}, + {tanh_pol_table, {0xc70e3d90, false}}, + {tanh_pol_table, {0xc3eba94a, false}}, + {tanh_pol_table, {0x43e0c424, false}}, + {tanh_pol_table, {0xc21f4552, false}}, + {tanh_pol_table, {0x42217cc8, false}}, + {tanh_pol_table, {0x405e7dc4, false}}, + {tanh_pol_table, {0xc10dd401, false}}, + {tanh_pol_table, {0x3e96b602, false}}, + {tanh_pol_table, {0xbd1a6d2f, false}}, + {tanh_pol_table, {0xbd393883, false}}, + {tanh_pol_table, {0xbd674682, false}}, + {tanh_pol_table, {0xbd310016, false}}, + {tanh_pol_table, {0xb961e269, false}}, + {tanh_pol_table, {0x3ba32495, false}}, + {tanh_pol_table, {0x3a7680d5, false}}, + {tanh_pol_table, {0x38b3173c, false}}, + {tanh_pol_table, {0x35a9deea, false}}, + {tanh_pol_table, {0x375c3f2a, false}}, + {tanh_pol_table, {0x00000000, false}}, + // coefficients of degree 6 + {tanh_pol_table, {0x00000000, false}}, + {tanh_pol_table, {0xd8995ed1, false}}, + {tanh_pol_table, {0x558285ea, false}}, + {tanh_pol_table, {0x55b2cd69, false}}, + {tanh_pol_table, {0x53028625, false}}, + {tanh_pol_table, {0x4bc9991f, false}}, + {tanh_pol_table, {0x5082898a, false}}, + {tanh_pol_table, {0x4b4999b3, false}}, + {tanh_pol_table, {0x4e02c07c, false}}, + {tanh_pol_table, {0x4ac99764, false}}, + {tanh_pol_table, {0x4b72c822, false}}, + {tanh_pol_table, {0xca40c0e1, false}}, + {tanh_pol_table, {0x489413e4, false}}, + {tanh_pol_table, {0x49b12224, false}}, + {tanh_pol_table, {0x46134c4e, false}}, + {tanh_pol_table, {0xc60c2d57, false}}, + {tanh_pol_table, {0x43c83910, false}}, + {tanh_pol_table, {0xc3c872d1, false}}, + {tanh_pol_table, {0xc186bc9e, false}}, + {tanh_pol_table, {0x42325bc3, false}}, + {tanh_pol_table, {0xbf2ffa4a, false}}, + {tanh_pol_table, {0x3d9a203c, false}}, + {tanh_pol_table, {0xbc545a43, false}}, + {tanh_pol_table, {0xbae08fee, false}}, + {tanh_pol_table, {0x3c80225d, false}}, + {tanh_pol_table, {0x3b1fd1df, false}}, + {tanh_pol_table, {0xba36b9d1, false}}, + {tanh_pol_table, {0xb91de544, false}}, + {tanh_pol_table, {0xb71f100f, false}}, + {tanh_pol_table, {0xb408e2ed, false}}, + {tanh_pol_table, {0xb685fec8, false}}, + {tanh_pol_table, {0x00000000, false}}, }; // soft_relu(x) constants @@ -1703,6 +2097,215 @@ void jit_uni_eltwise_injector_f32::register_table_entries() { {gelu_erf_pol, {0xbfba00e3, true}}, // p4 = -1.453152027f {gelu_erf_pol, {0x3f87dc22, true}}, // p5 = 1.061405429f }; + // gelu_erf(x) constants for direct erf approximation (formula defined) + static const table_t gelu_erf_minimax_consts { + {gelu_erf_idx_bias, {0xc21fffff, true}}, + {gelu_erf_rbound, {0x40b15cee, true}}, + {gelu_erf_one, {0x00000001, true}}, + {gelu_erf_twenty_three, {0x00000017, true}}, + {gelu_erf_twenty_four, {0x00000018, true}}, + }; + // gelu_erf(x) minimax polynomials for piecewise approximaxtion + static const table_t gelu_erf_minimax_polynomial { + // coefficients of degree 0 + {gelu_erf_minimax_pol, {0xa6f2cb94, false}}, // -0x1.e59728p-50 + {gelu_erf_minimax_pol, {0x32827792, false}}, // 0x1.04ef24p-26 + {gelu_erf_minimax_pol, {0x3381cc0c, false}}, // 0x1.039818p-24 + {gelu_erf_minimax_pol, {0x34523d4a, false}}, // 0x1.a47a94p-23 + {gelu_erf_minimax_pol, {0x351ac44d, false}}, // 0x1.35889ap-21 + {gelu_erf_minimax_pol, {0x35f36d88, false}}, // 0x1.e6db1p-20 + {gelu_erf_minimax_pol, {0x36ee8229, false}}, // 0x1.dd0452p-18 + {gelu_erf_minimax_pol, {0x37b8a3bb, false}}, // 0x1.714776p-16 + {gelu_erf_minimax_pol, {0x3867a213, false}}, // 
0x1.cf4426p-15 + {gelu_erf_minimax_pol, {0x3940033b, false}}, // 0x1.800676p-13 + {gelu_erf_minimax_pol, {0x3a2a5a1d, false}}, // 0x1.54b43ap-11 + {gelu_erf_minimax_pol, {0x3ae35863, false}}, // 0x1.c6b0c6p-10 + {gelu_erf_minimax_pol, {0x3b7828f2, false}}, // 0x1.f051e4p-9 + {gelu_erf_minimax_pol, {0x3c08b14b, false}}, // 0x1.116296p-7 + {gelu_erf_minimax_pol, {0x3c515ed3, false}}, // 0x1.a2bda6p-7 + {gelu_erf_minimax_pol, {0xbb503236, false}}, // -0x1.a0646cp-9 + {gelu_erf_minimax_pol, {0xbd8d8e5e, false}}, // -0x1.1b1cbcp-4 + {gelu_erf_minimax_pol, {0xbe8abcd9, false}}, // -0x1.1579b2p-2 + {gelu_erf_minimax_pol, {0xbf0c19a2, false}}, // -0x1.183344p-1 + {gelu_erf_minimax_pol, {0xbeccb328, false}}, // -0x1.99665p-2 + {gelu_erf_minimax_pol, {0x3e176ced, false}}, // 0x1.2ed9dap-3 + {gelu_erf_minimax_pol, {0x3f470d99, false}}, // 0x1.8e1b32p-1 + {gelu_erf_minimax_pol, {0x3f7abb28, false}}, // 0x1.f5765p-1 + {gelu_erf_minimax_pol, {0x3f800000, false}}, // 0x1p0 + {gelu_erf_minimax_pol, {0x00000000, false}}, // 0 + {gelu_erf_minimax_pol, {0x00000000, false}}, // 0 padd + {gelu_erf_minimax_pol, {0x00000000, false}}, // 0 padd + {gelu_erf_minimax_pol, {0x00000000, false}}, // 0 padd + {gelu_erf_minimax_pol, {0x00000000, false}}, // 0 padd + {gelu_erf_minimax_pol, {0x00000000, false}}, // 0 padd + {gelu_erf_minimax_pol, {0x00000000, false}}, // 0 padd + {gelu_erf_minimax_pol, {0x00000000, false}}, // 0 padd + // coefficients of degree 1 + {gelu_erf_minimax_pol, {0x3f4c422a, false}}, // 0x1.988454p-1 + {gelu_erf_minimax_pol, {0x3f4c421f, false}}, // 0x1.98843ep-1 + {gelu_erf_minimax_pol, {0x3f4c4207, false}}, // 0x1.98840ep-1 + {gelu_erf_minimax_pol, {0x3f4c41cb, false}}, // 0x1.988396p-1 + {gelu_erf_minimax_pol, {0x3f4c413b, false}}, // 0x1.988276p-1 + {gelu_erf_minimax_pol, {0x3f4c3fad, false}}, // 0x1.987f5ap-1 + {gelu_erf_minimax_pol, {0x3f4c3a2f, false}}, // 0x1.98745ep-1 + {gelu_erf_minimax_pol, {0x3f4c2d40, false}}, // 0x1.985a8p-1 + {gelu_erf_minimax_pol, {0x3f4c146a, false}}, // 0x1.9828d4p-1 + {gelu_erf_minimax_pol, {0x3f4bc341, false}}, // 0x1.978682p-1 + {gelu_erf_minimax_pol, {0x3f4ad08c, false}}, // 0x1.95a118p-1 + {gelu_erf_minimax_pol, {0x3f48f8cf, false}}, // 0x1.91f19ep-1 + {gelu_erf_minimax_pol, {0x3f45fac7, false}}, // 0x1.8bf58ep-1 + {gelu_erf_minimax_pol, {0x3f404e07, false}}, // 0x1.809c0ep-1 + {gelu_erf_minimax_pol, {0x3f3b980f, false}}, // 0x1.77301ep-1 + {gelu_erf_minimax_pol, {0x3f48dff3, false}}, // 0x1.91bfe6p-1 + {gelu_erf_minimax_pol, {0x3f78b21b, false}}, // 0x1.f16436p-1 + {gelu_erf_minimax_pol, {0x3fbb0704, false}}, // 0x1.760e08p0 + {gelu_erf_minimax_pol, {0x40019c32, false}}, // 0x1.033864p1 + {gelu_erf_minimax_pol, {0x3fe536d6, false}}, // 0x1.ca6dacp0 + {gelu_erf_minimax_pol, {0x3f81331e, false}}, // 0x1.02663cp0 + {gelu_erf_minimax_pol, {0x3e6c8684, false}}, // 0x1.d90d08p-3 + {gelu_erf_minimax_pol, {0x3c98f936, false}}, // 0x1.31f26cp-6 + {gelu_erf_minimax_pol, {0x00000000, false}}, // 0 + {gelu_erf_minimax_pol, {0x3f800000, false}}, // 0x1p0 + {gelu_erf_minimax_pol, {0x00000000, false}}, // 0 padd + {gelu_erf_minimax_pol, {0x00000000, false}}, // 0 padd + {gelu_erf_minimax_pol, {0x00000000, false}}, // 0 padd + {gelu_erf_minimax_pol, {0x00000000, false}}, // 0 padd + {gelu_erf_minimax_pol, {0x00000000, false}}, // 0 padd + {gelu_erf_minimax_pol, {0x00000000, false}}, // 0 padd + {gelu_erf_minimax_pol, {0x00000000, false}}, // 0 padd + // coefficients of degree 2 + {gelu_erf_minimax_pol, {0xb62173f4, false}}, // -0x1.42e7e8p-19 + {gelu_erf_minimax_pol, 
{0x3735e4cf, false}}, // 0x1.6bc99ep-17 + {gelu_erf_minimax_pol, {0x37f2ff89, false}}, // 0x1.e5ff12p-16 + {gelu_erf_minimax_pol, {0x388c23be, false}}, // 0x1.18477cp-14 + {gelu_erf_minimax_pol, {0x3917535c, false}}, // 0x1.2ea6b8p-13 + {gelu_erf_minimax_pol, {0x39ab2ab0, false}}, // 0x1.56556p-12 + {gelu_erf_minimax_pol, {0x3a60fadb, false}}, // 0x1.c1f5b6p-11 + {gelu_erf_minimax_pol, {0x3af9b960, false}}, // 0x1.f372cp-10 + {gelu_erf_minimax_pol, {0x3b6e5491, false}}, // 0x1.dca922p-9 + {gelu_erf_minimax_pol, {0x3c0a4ec5, false}}, // 0x1.149d8ap-7 + {gelu_erf_minimax_pol, {0x3ca5aa8c, false}}, // 0x1.4b5518p-6 + {gelu_erf_minimax_pol, {0x3d2138d9, false}}, // 0x1.4271b2p-5 + {gelu_erf_minimax_pol, {0x3d8737d4, false}}, // 0x1.0e6fa8p-4 + {gelu_erf_minimax_pol, {0x3ddfb660, false}}, // 0x1.bf6ccp-4 + {gelu_erf_minimax_pol, {0x3e0f27ab, false}}, // 0x1.1e4f56p-3 + {gelu_erf_minimax_pol, {0x3d94004b, false}}, // 0x1.280096p-4 + {gelu_erf_minimax_pol, {0xbe0efdeb, false}}, // -0x1.1dfbd6p-3 + {gelu_erf_minimax_pol, {0xbf1d96c3, false}}, // -0x1.3b2d86p-1 + {gelu_erf_minimax_pol, {0xbf89db58, false}}, // -0x1.13b6bp0 + {gelu_erf_minimax_pol, {0xbf6d9897, false}}, // -0x1.db312ep-1 + {gelu_erf_minimax_pol, {0xbef69fb8, false}}, // -0x1.ed3f7p-2 + {gelu_erf_minimax_pol, {0xbdc4f8a8, false}}, // -0x1.89f15p-4 + {gelu_erf_minimax_pol, {0xbbde6422, false}}, // -0x1.bcc844p-8 + {gelu_erf_minimax_pol, {0x00000000, false}}, // 0 + {gelu_erf_minimax_pol, {0x00000000, false}}, // 0 + {gelu_erf_minimax_pol, {0x00000000, false}}, // 0 padd + {gelu_erf_minimax_pol, {0x00000000, false}}, // 0 padd + {gelu_erf_minimax_pol, {0x00000000, false}}, // 0 padd + {gelu_erf_minimax_pol, {0x00000000, false}}, // 0 padd + {gelu_erf_minimax_pol, {0x00000000, false}}, // 0 padd + {gelu_erf_minimax_pol, {0x00000000, false}}, // 0 padd + {gelu_erf_minimax_pol, {0x00000000, false}}, // 0 padd + // coefficients of degree 3 + {gelu_erf_minimax_pol, {0xbe081a19, false}}, // -0x1.103432p-3 + {gelu_erf_minimax_pol, {0xbe084570, false}}, // -0x1.108aep-3 + {gelu_erf_minimax_pol, {0xbe08639b, false}}, // -0x1.10c736p-3 + {gelu_erf_minimax_pol, {0xbe089837, false}}, // -0x1.11306ep-3 + {gelu_erf_minimax_pol, {0xbe08f409, false}}, // -0x1.11e812p-3 + {gelu_erf_minimax_pol, {0xbe09ab95, false}}, // -0x1.13572ap-3 + {gelu_erf_minimax_pol, {0xbe0b66d0, false}}, // -0x1.16cdap-3 + {gelu_erf_minimax_pol, {0xbe0e400a, false}}, // -0x1.1c8014p-3 + {gelu_erf_minimax_pol, {0xbe124df8, false}}, // -0x1.249bfp-3 + {gelu_erf_minimax_pol, {0xbe1bde02, false}}, // -0x1.37bc04p-3 + {gelu_erf_minimax_pol, {0xbe2f19c9, false}}, // -0x1.5e3392p-3 + {gelu_erf_minimax_pol, {0xbe4931bf, false}}, // -0x1.92637ep-3 + {gelu_erf_minimax_pol, {0xbe685fbc, false}}, // -0x1.d0bf78p-3 + {gelu_erf_minimax_pol, {0xbe89c95f, false}}, // -0x1.1392bep-2 + {gelu_erf_minimax_pol, {0xbe96cbca, false}}, // -0x1.2d9794p-2 + {gelu_erf_minimax_pol, {0xbe8044aa, false}}, // -0x1.008954p-2 + {gelu_erf_minimax_pol, {0xbe0550f2, false}}, // -0x1.0aa1e4p-3 + {gelu_erf_minimax_pol, {0x3dcfd6a1, false}}, // 0x1.9fad42p-4 + {gelu_erf_minimax_pol, {0x3e94c826, false}}, // 0x1.29904cp-2 + {gelu_erf_minimax_pol, {0x3e79345f, false}}, // 0x1.f268bep-3 + {gelu_erf_minimax_pol, {0x3decec91, false}}, // 0x1.d9d922p-4 + {gelu_erf_minimax_pol, {0x3ca46568, false}}, // 0x1.48cadp-6 + {gelu_erf_minimax_pol, {0x3aa1e00a, false}}, // 0x1.43c014p-10 + {gelu_erf_minimax_pol, {0x00000000, false}}, // 0 + {gelu_erf_minimax_pol, {0x00000000, false}}, // 0 + {gelu_erf_minimax_pol, {0x00000000, 
false}}, // 0 padd + {gelu_erf_minimax_pol, {0x00000000, false}}, // 0 padd + {gelu_erf_minimax_pol, {0x00000000, false}}, // 0 padd + {gelu_erf_minimax_pol, {0x00000000, false}}, // 0 padd + {gelu_erf_minimax_pol, {0x00000000, false}}, // 0 padd + {gelu_erf_minimax_pol, {0x00000000, false}}, // 0 padd + {gelu_erf_minimax_pol, {0x00000000, false}}, // 0 padd + // coefficients of degree 4 + {gelu_erf_minimax_pol, {0xba3d61db, false}}, // -0x1.7ac3b6p-11 + {gelu_erf_minimax_pol, {0x39f097a3, false}}, // 0x1.e12f46p-12 + {gelu_erf_minimax_pol, {0x3a5845dc, false}}, // 0x1.b08bb8p-11 + {gelu_erf_minimax_pol, {0x3ab1fa35, false}}, // 0x1.63f46ap-10 + {gelu_erf_minimax_pol, {0x3b0cefb8, false}}, // 0x1.19df7p-9 + {gelu_erf_minimax_pol, {0x3b653ab6, false}}, // 0x1.ca756cp-9 + {gelu_erf_minimax_pol, {0x3bcae527, false}}, // 0x1.95ca4ep-8 + {gelu_erf_minimax_pol, {0x3c221712, false}}, // 0x1.442e24p-7 + {gelu_erf_minimax_pol, {0x3c6c5840, false}}, // 0x1.d8b08p-7 + {gelu_erf_minimax_pol, {0x3cc0a703, false}}, // 0x1.814e06p-6 + {gelu_erf_minimax_pol, {0x3d1dcc19, false}}, // 0x1.3b9832p-5 + {gelu_erf_minimax_pol, {0x3d63656d, false}}, // 0x1.c6cadap-5 + {gelu_erf_minimax_pol, {0x3d955907, false}}, // 0x1.2ab20ep-4 + {gelu_erf_minimax_pol, {0x3dbf9910, false}}, // 0x1.7f322p-4 + {gelu_erf_minimax_pol, {0x3dd53f69, false}}, // 0x1.aa7ed2p-4 + {gelu_erf_minimax_pol, {0x3db7dcef, false}}, // 0x1.6fb9dep-4 + {gelu_erf_minimax_pol, {0x3d639ebe, false}}, // 0x1.c73d7cp-5 + {gelu_erf_minimax_pol, {0xba6ede48, false}}, // -0x1.ddbc9p-11 + {gelu_erf_minimax_pol, {0xbd22be69, false}}, // -0x1.457cd2p-5 + {gelu_erf_minimax_pol, {0xbd041cf1, false}}, // -0x1.0839e2p-5 + {gelu_erf_minimax_pol, {0xbc64f5ab, false}}, // -0x1.c9eb56p-7 + {gelu_erf_minimax_pol, {0xbb097a32, false}}, // -0x1.12f464p-9 + {gelu_erf_minimax_pol, {0xb8ebf380, false}}, // -0x1.d7e7p-14 + {gelu_erf_minimax_pol, {0x00000000, false}}, // 0 + {gelu_erf_minimax_pol, {0x00000000, false}}, // 0 + {gelu_erf_minimax_pol, {0x00000000, false}}, // 0 padd + {gelu_erf_minimax_pol, {0x00000000, false}}, // 0 padd + {gelu_erf_minimax_pol, {0x00000000, false}}, // 0 padd + {gelu_erf_minimax_pol, {0x00000000, false}}, // 0 padd + {gelu_erf_minimax_pol, {0x00000000, false}}, // 0 padd + {gelu_erf_minimax_pol, {0x00000000, false}}, // 0 padd + {gelu_erf_minimax_pol, {0x00000000, false}}, // 0 padd + // coefficients of degree 5 + {gelu_erf_minimax_pol, {0x3cb7d80c, false}}, // 0x1.6fb018p-6 + {gelu_erf_minimax_pol, {0x3c9b6050, false}}, // 0x1.36c0ap-6 + {gelu_erf_minimax_pol, {0x3c978d11, false}}, // 0x1.2f1a22p-6 + {gelu_erf_minimax_pol, {0x3c92e850, false}}, // 0x1.25d0ap-6 + {gelu_erf_minimax_pol, {0x3c8d058b, false}}, // 0x1.1a0b16p-6 + {gelu_erf_minimax_pol, {0x3c848454, false}}, // 0x1.0908a8p-6 + {gelu_erf_minimax_pol, {0x3c6cd623, false}}, // 0x1.d9ac46p-7 + {gelu_erf_minimax_pol, {0x3c4c824b, false}}, // 0x1.990496p-7 + {gelu_erf_minimax_pol, {0x3c2a7935, false}}, // 0x1.54f26ap-7 + {gelu_erf_minimax_pol, {0x3be0b390, false}}, // 0x1.c1672p-8 + {gelu_erf_minimax_pol, {0x3b0651ac, false}}, // 0x1.0ca358p-9 + {gelu_erf_minimax_pol, {0xbb232f53, false}}, // -0x1.465ea6p-9 + {gelu_erf_minimax_pol, {0xbbd42fa0, false}}, // -0x1.a85f4p-8 + {gelu_erf_minimax_pol, {0xbc2c5366, false}}, // -0x1.58a6ccp-7 + {gelu_erf_minimax_pol, {0xbc492c9e, false}}, // -0x1.92593cp-7 + {gelu_erf_minimax_pol, {0xbc2a7aa6, false}}, // -0x1.54f54cp-7 + {gelu_erf_minimax_pol, {0xbbd55d04, false}}, // -0x1.aaba08p-8 + {gelu_erf_minimax_pol, {0xba823a76, false}}, // 
-0x1.0474ecp-10 + {gelu_erf_minimax_pol, {0x3b102aa8, false}}, // 0x1.20555p-9 + {gelu_erf_minimax_pol, {0x3ae25a7e, false}}, // 0x1.c4b4fcp-10 + {gelu_erf_minimax_pol, {0x3a31f792, false}}, // 0x1.63ef24p-11 + {gelu_erf_minimax_pol, {0x38b84375, false}}, // 0x1.7086eap-14 + {gelu_erf_minimax_pol, {0x3689bb5a, false}}, // 0x1.1376b4p-18 + {gelu_erf_minimax_pol, {0x00000000, false}}, // 0 + {gelu_erf_minimax_pol, {0x00000000, false}}, // 0 + {gelu_erf_minimax_pol, {0x00000000, false}}, // 0 padd + {gelu_erf_minimax_pol, {0x00000000, false}}, // 0 padd + {gelu_erf_minimax_pol, {0x00000000, false}}, // 0 padd + {gelu_erf_minimax_pol, {0x00000000, false}}, // 0 padd + {gelu_erf_minimax_pol, {0x00000000, false}}, // 0 padd + {gelu_erf_minimax_pol, {0x00000000, false}}, // 0 padd + {gelu_erf_minimax_pol, {0x00000000, false}}, // 0 padd + }; // This object takes care about which constants and polynomials to include. struct need_t { @@ -1775,12 +2378,14 @@ void jit_uni_eltwise_injector_f32::register_table_entries() { if (need.exp()) push_entries_of(exp_consts2); if (need.mish()) push_entries_of(mish_consts); if (need.tanh()) push_entries_of(tanh_consts); + if (need.tanh()) push_entries_of(tanh_polynomial_table); if (need.soft_relu()) push_entries_of(soft_relu_consts); if (need.soft_relu()) push_entries_of(soft_relu_polynomial); if (need.gelu_tanh()) push_entries_of(gelu_tanh_consts); if (need.gelu_erf()) push_entries_of(gelu_erf_consts); if (need.gelu_erf()) push_entries_of(gelu_erf_polynomial); - + if (need.gelu_erf()) push_entries_of(gelu_erf_minimax_consts); + if (need.gelu_erf()) push_entries_of(gelu_erf_minimax_polynomial); // Now that we registered the entries, we set the offsets. No // entries should be registered after this point. This allows to // expect the same order when injecting the table entries in diff --git a/src/cpu/aarch64/injectors/jit_uni_eltwise_injector.hpp b/src/cpu/aarch64/injectors/jit_uni_eltwise_injector.hpp index 7301d99d567..355f877ccb2 100644 --- a/src/cpu/aarch64/injectors/jit_uni_eltwise_injector.hpp +++ b/src/cpu/aarch64/injectors/jit_uni_eltwise_injector.hpp @@ -1,6 +1,6 @@ /******************************************************************************* * Copyright 2019-2023 Intel Corporation -* Copyright 2021-2023 FUJITSU LIMITED +* Copyright 2021-2024 FUJITSU LIMITED * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
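Each degree row of the gelu_erf_minimax_polynomial table above holds 32 entries: 25 coefficients plus seven zero "padd" slots that keep the per-degree stride vector-friendly. Per the header comments below, the kernel computes a table index from the input bits using gelu_erf_idx_bias, clamps it with the gelu_erf_one/twenty_three/twenty_four integer constants, and evaluates the degree-0..5 polynomial selected by that index. The following is only a minimal scalar sketch of the evaluation step: eval_minimax_poly is a hypothetical name, idx is assumed to be already derived, coef[d][i] is assumed to hold the table's bit patterns reinterpreted as floats, and the index derivation, the clamp at gelu_erf_rbound, and sign handling are all omitted.

    // Hypothetical helper, not part of the patch: Horner evaluation of the
    // per-interval minimax polynomial (degrees 5 down to 0).
    static float eval_minimax_poly(float t, int idx, const float coef[6][32]) {
        float p = coef[5][idx]; // highest-degree coefficient first
        for (int d = 4; d >= 0; --d)
            p = p * t + coef[d][idx]; // Horner step: p = p * t + c_d
        return p;
    }

The degree-1 row starts at 0x3f4c422a (about 0.797884, i.e. sqrt(2/pi), the slope of erf(t/sqrt(2)) at zero), which suggests the table approximates erf(x/sqrt(2)) directly; gelu_erf(x) = 0.5 * x * (1 + erf(x/sqrt(2))) then costs one extra multiply-add.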
@@ -215,6 +215,7 @@ struct jit_uni_eltwise_injector_f32 { void relu_zero_ns_compute_vector_fwd(const TRegS &vmm_src); void elu_compute_vector_fwd(const TRegS &vmm_src); void tanh_compute_vector_fwd(const TRegS &vmm_src); + void tanh_polynomial_approx_compute_vector_fwd(const TRegS &vmm_src); void square_compute_vector_fwd(const TRegS &vmm_src); void abs_compute_vector_fwd(const TRegS &vmm_src); void sqrt_compute_vector_fwd(const TRegS &vmm_src); @@ -277,12 +278,23 @@ struct jit_uni_eltwise_injector_f32 { bwd_mish_max_x_for_equation_f, tanh_range, // tanh(x) = x - x^3/3 for |x| < tanh_range tanh_m1d3, // -1/3 + tanh_idx_bias, // bias applied during index computation + tanh_idx_mask, // mask applied to extract index + tanh_linear_ubound, // arg below which tanh(x) = x + tanh_saturation_lbound, // arg after which tanh(x) = 1.f + tanh_pol_table, // table of polynomial coefficients soft_relu_one_twenty_six, // 126.f soft_relu_mantissa_sign_mask, // mask for mantissa bits and sign soft_relu_pol, // see correspondent table for float values gelu_tanh_fitting_const, // 0.044715f gelu_tanh_fitting_const_times_three, // 0.134145f gelu_tanh_sqrt_two_over_pi, // sqrtf(2.f/pi) = 0.797884f + gelu_erf_idx_bias, // bias applied to compute table index + gelu_erf_rbound, // upper bound at which we clamp erf at 1 + gelu_erf_one, // just the integer value 1, used for index clamping + gelu_erf_twenty_three, // just the integer value 23, used for index clamping + gelu_erf_twenty_four, // just the integer value 24, used for index clamping + gelu_erf_minimax_pol, // see correspondent table for float values gelu_erf_approx_const, // 0.3275911f - implementation based for approx gelu_erf_one_over_sqrt_two, // 1.f / sqrtf(2.f) gelu_erf_one_over_sqrt_pi, // 1.f / sqrtf(pi) = 0.564190f diff --git a/src/cpu/aarch64/jit_brdgmm_dw_conv.cpp b/src/cpu/aarch64/jit_brdgmm_dw_conv.cpp index 24e018aef02..226864baad2 100644 --- a/src/cpu/aarch64/jit_brdgmm_dw_conv.cpp +++ b/src/cpu/aarch64/jit_brdgmm_dw_conv.cpp @@ -108,7 +108,7 @@ status_t brdgmm_dw_convolution_fwd_t::pd_t::init(engine_t *engine) { // const auto isa = sve_512; auto skip_mask = skip_mask_t::post_ops; - if (is_int8) skip_mask |= skip_mask_t::scales_runtime; + if (is_int8) skip_mask |= skip_mask_t::scales; bool ok = is_fwd() && set_default_alg_kind(alg_kind::convolution_direct) && one_of(true, is_f32, is_int8) && (isa != isa_undef) @@ -200,7 +200,7 @@ status_t brdgmm_dw_convolution_fwd_t::pd_t::init(engine_t *engine) { const auto &wei_scales = attr_.scales_.get(DNNL_ARG_WEIGHTS); jcp.with_scale = !src_scales.has_default_values() || !wei_scales.has_default_values(); - jcp.is_oc_scale = wei_scales.mask_ != 0; + jcp.is_oc_scale = wei_scales.get_mask() > 0; const bool scales_ok = attr_scales_ok({DNNL_ARG_SRC, DNNL_ARG_WEIGHTS, DNNL_ARG_DST}); diff --git a/src/cpu/aarch64/jit_brdgmm_dw_conv.hpp b/src/cpu/aarch64/jit_brdgmm_dw_conv.hpp index 830c9e56bfd..61d6a726fcf 100644 --- a/src/cpu/aarch64/jit_brdgmm_dw_conv.hpp +++ b/src/cpu/aarch64/jit_brdgmm_dw_conv.hpp @@ -34,15 +34,13 @@ namespace aarch64 { template struct brdgmm_dw_convolution_fwd_t : public primitive_t { struct pd_t : public cpu_convolution_fwd_pd_t { - pd_t(const convolution_desc_t *adesc, const primitive_attr_t *attr, - const typename pd_t::base_class *hint_fwd_pd) - : cpu_convolution_fwd_pd_t(adesc, attr, hint_fwd_pd), jcp_() {} + using cpu_convolution_fwd_pd_t::cpu_convolution_fwd_pd_t; DECLARE_COMMON_PD_T(JIT_IMPL_NAME_HELPER("brdgmm_dw:", jcp_.isa, ""), brdgmm_dw_convolution_fwd_t); status_t 
init(engine_t *engine); - jit_brdgmm_conv_conf_t jcp_; + jit_brdgmm_conv_conf_t jcp_ = utils::zero(); std::vector bcps_; std::vector batches_; std::vector bs_; diff --git a/src/cpu/aarch64/jit_brgemm_1x1_conv.cpp b/src/cpu/aarch64/jit_brgemm_1x1_conv.cpp index d9e8e49d3d0..808b6685b19 100644 --- a/src/cpu/aarch64/jit_brgemm_1x1_conv.cpp +++ b/src/cpu/aarch64/jit_brgemm_1x1_conv.cpp @@ -54,8 +54,8 @@ status_t brgemm_1x1_convolution_fwd_t::pd_t::init(engine_t *engine) { using skip_mask_t = primitive_attr_t::skip_mask_t; auto skip_mask = skip_mask_t::post_ops | skip_mask_t::sum_dt - | skip_mask_t::zero_points_runtime; - if (one_of(src_type, u8, s8)) skip_mask |= skip_mask_t::scales_runtime; + | skip_mask_t::zero_points; + if (one_of(src_type, u8, s8)) skip_mask |= skip_mask_t::scales; bool ok = is_fwd() && set_default_alg_kind(alg_kind::convolution_direct) && expect_data_types(src_type, wei_type, data_type::undef, dst_type, @@ -115,6 +115,7 @@ status_t brgemm_1x1_convolution_fwd_t::pd_t::init(engine_t *engine) { brg.with_weights_scale_adjust = jcp_.scale_adjust_factor != 1.0f; CHECK(brgemm_desc_set_postops( &brg, attr(), &dst_md_, LDD, jcp_.bia_dt)); + CHECK(brgemm_desc_finalize(&brg)); brgs_->insert(brg_idx, brg); } diff --git a/src/cpu/aarch64/jit_brgemm_1x1_conv.hpp b/src/cpu/aarch64/jit_brgemm_1x1_conv.hpp index 7843d14d7a0..20e698c4c61 100644 --- a/src/cpu/aarch64/jit_brgemm_1x1_conv.hpp +++ b/src/cpu/aarch64/jit_brgemm_1x1_conv.hpp @@ -43,11 +43,7 @@ namespace aarch64 { template struct brgemm_1x1_convolution_fwd_t : public primitive_t { struct pd_t : public cpu_convolution_fwd_pd_t { - pd_t(const convolution_desc_t *adesc, const primitive_attr_t *attr, - const typename pd_t::base_class *hint_fwd_pd) - : cpu_convolution_fwd_pd_t(adesc, attr, hint_fwd_pd) - , with_sum(false) - , sum_scale(0) {} + using cpu_convolution_fwd_pd_t::cpu_convolution_fwd_pd_t; DECLARE_COMMON_PD_T(JIT_IMPL_NAME_HELPER("brgconv_1x1:", isa, ""), brgemm_1x1_convolution_fwd_t); @@ -55,13 +51,13 @@ struct brgemm_1x1_convolution_fwd_t : public primitive_t { status_t init(engine_t *engine); std::shared_ptr brgs_; - bool with_sum; - float sum_scale; + bool with_sum = false; + float sum_scale = 0.f; bool need_postwork; int ic_chunks; - jit_brgemm_conv_conf_t jcp_; + jit_brgemm_conv_conf_t jcp_ = utils::zero(); protected: bool arg_scales_ok() const { @@ -70,12 +66,20 @@ struct brgemm_1x1_convolution_fwd_t : public primitive_t { return attr_scales_ok(supported_args); } bool zero_points_ok() const { - // Only common zero points are supported -> mask should only be 0 - int mask_src = 0, mask_dst = 0; - attr()->zero_points_.get(DNNL_ARG_SRC, &mask_src); - attr()->zero_points_.get(DNNL_ARG_DST, &mask_dst); - return attr()->zero_points_.has_default_values(DNNL_ARG_WEIGHTS) - && mask_src == 0 && mask_dst == 0; + const auto &zp = attr()->zero_points_; + + if (!zp.has_default_values(DNNL_ARG_SRC)) { + int mask_src = zp.get_mask(DNNL_ARG_SRC); + const bool ok = mask_src == 0; + if (!ok) return false; + } + if (!zp.has_default_values(DNNL_ARG_DST)) { + int mask_dst = zp.get_mask(DNNL_ARG_DST); + const bool ok = mask_dst == 0; + if (!ok) return false; + } + + return zp.has_default_values(DNNL_ARG_WEIGHTS); } }; diff --git a/src/cpu/aarch64/jit_brgemm_conv.cpp b/src/cpu/aarch64/jit_brgemm_conv.cpp index c649e0cb690..36b126c9cd3 100644 --- a/src/cpu/aarch64/jit_brgemm_conv.cpp +++ b/src/cpu/aarch64/jit_brgemm_conv.cpp @@ -1,6 +1,6 @@ /******************************************************************************* * Copyright 2021-2023 
Intel Corporation -* Copyright 2024 FUJITSU LIMITED +* Copyright 2024-2025 FUJITSU LIMITED * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -43,8 +43,8 @@ using namespace jit_uni_brgemm_conv_comp_pad_kernel; #define ndims_pick(v5, v4, v3) \ ((ndims == 5) ? (v5) : (ndims == 4) ? (v4) : (ndims == 3) ? (v3) : 0) -template -void brgemm_convolution_fwd_t::pd_t::init_batch(int icc, +template +void brgemm_convolution_fwd_t::pd_t::init_batch(int icc, const char *src_base, const char *wei_base, int n_ic_blocks, int ic_block_s, int iid_b, int iih_b, int iiw_b, const dim_t *const __restrict kw_top_vpads, @@ -117,8 +117,8 @@ void brgemm_convolution_fwd_t::pd_t::init_batch(int icc, } } -template -inline void brgemm_convolution_fwd_t::pd_t::get_A_B(int icc, +template +inline void brgemm_convolution_fwd_t::pd_t::get_A_B(int icc, const char *src_base, const char *wei_base, int ic_block_s, int iid_b, int iih_b, int iiw_b, int kd_b, int kh_b, const void *&ptrA, const void *&ptrB) const { @@ -147,10 +147,9 @@ inline void brgemm_convolution_fwd_t::pd_t::get_A_B(int icc, ptrB = wei_base_kh + wei_kw * wei_kw_offset; } -template -status_t brgemm_convolution_fwd_t::pd_t::add_brg_descriptor( - int vM, int i_N, int i_K, int i_init, int kd_b, int kd_e, int kh_b, - int kh_e) { +template +status_t brgemm_convolution_fwd_t::pd_t::add_brg_descriptor(int vM, + int i_N, int i_K, int i_init, int kd_b, int kd_e, int kh_b, int kh_e) { const auto src_type = src_md(0)->data_type; const auto wei_type = weights_md(0)->data_type; @@ -265,7 +264,7 @@ status_t brgemm_convolution_fwd_t::pd_t::add_brg_descriptor( brgattr.hint_expected_B_size = 0; brgattr.hint_expected_C_size = 0; - brgattr.wary_tail_read = false; + brgattr.wary_A_k_tail_read = false; brgattr.bd_mask_level = jcp_.use_M_mask; brgattr.max_top_vpad = jcp_.max_vpad; @@ -280,14 +279,15 @@ status_t brgemm_convolution_fwd_t::pd_t::add_brg_descriptor( brg.with_weights_scale_adjust = jcp_.scale_adjust_factor != 1.0f; CHECK(brgemm_desc_set_postops(&brg, attr(), &dst_md_, LDD, jcp_.bia_dt)); + CHECK(brgemm_desc_finalize(&brg)); + brgemm_descriptors_->insert(brg_idx, brg, bd_mask, stoffs); return status::success; } -template -status_t brgemm_convolution_fwd_t::pd_t::init( - engine_t *engine) { +template +status_t brgemm_convolution_fwd_t::pd_t::init(engine_t *engine) { using namespace data_type; using namespace utils; brgemm_descriptors_ @@ -304,15 +304,15 @@ status_t brgemm_convolution_fwd_t::pd_t::init( // executing 'use_inversion == true' as FWD. This can only work if the // diff_src_desc and diff_dst_desc are defined in the aforementioned. const convolution_desc_t &cd = *desc(); - if (use_inversion + if (cd.use_inversion && one_of(true, types::is_zero_md(&cd.diff_src_desc), types::is_zero_md(&cd.diff_dst_desc))) return status::unimplemented; using skip_mask_t = primitive_attr_t::skip_mask_t; auto skip_mask = skip_mask_t::post_ops | skip_mask_t::sum_dt - | skip_mask_t::zero_points_runtime; - if (is_int8) skip_mask |= skip_mask_t::scales_runtime; + | skip_mask_t::zero_points; + if (is_int8) skip_mask |= skip_mask_t::scales; bool ok = is_fwd() && set_default_alg_kind(alg_kind::convolution_direct) && IMPLICATION(is_int8, @@ -334,6 +334,8 @@ status_t brgemm_convolution_fwd_t::pd_t::init( // For exec_base it makes sense to use unrolled kernel only if // there is no padding by width. // 2. For exec_trans block by kw is always KW + // 3. 
'false' is used intentionally to disable the condition, ensuring that + // the assert fails only when jcp_.use_uker is true, regardless of exec_type. assert(IMPLICATION(jcp_.use_uker, false && one_of(jcp_.exec_type, exec_base, exec_trans))); assert(IMPLICATION(jcp_.use_interleave_stores, jcp_.use_uker)); @@ -533,13 +535,12 @@ status_t brgemm_convolution_fwd_t::pd_t::init( return status::success; } -template -brgemm_convolution_fwd_t::brgemm_convolution_fwd_t( - const pd_t *apd) +template +brgemm_convolution_fwd_t::brgemm_convolution_fwd_t(const pd_t *apd) : primitive_t(apd), bias_d(pd()->weights_md(1)) {} -template -void brgemm_convolution_fwd_t::get_kw_range( +template +void brgemm_convolution_fwd_t::get_kw_range( int ow, int &kw_s, int &kw_full_s, int &kw_full_f, int &kw_f) const { // This function needed for exec_base only const auto _pd = pd(); @@ -568,8 +569,8 @@ void brgemm_convolution_fwd_t::get_kw_range( if (kw_full_f == -1) kw_full_s = kw_full_f = kw_f; } -template -inline void brgemm_convolution_fwd_t::get_ow_range( +template +inline void brgemm_convolution_fwd_t::get_ow_range( int ow, int kw, int &ow_s, int &ow_f) const { // This function needed for exec_base only const auto _pd = pd(); @@ -600,9 +601,9 @@ inline void brgemm_convolution_fwd_t::get_ow_range( ow_f = nstl::min(nstl::max(ow_f, ow_s), ow + M); } -template -status_t brgemm_convolution_fwd_t::add_brg_kernel(int M, - int i_N, int i_K, int i_init, int kd_b, int kd_e, int kh_b, int kh_e) { +template +status_t brgemm_convolution_fwd_t::add_brg_kernel(int M, int i_N, int i_K, + int i_init, int kd_b, int kd_e, int kh_b, int kh_e) { if (M <= 0) return status::success; const auto _pd = pd(); const auto &jcp = _pd->jcp_; @@ -621,8 +622,8 @@ status_t brgemm_convolution_fwd_t::add_brg_kernel(int M, return status::success; } -template -status_t brgemm_convolution_fwd_t::add_po_kernel( +template +status_t brgemm_convolution_fwd_t::add_po_kernel( brgemm_t *bcfg, int ker_idx, bool is_init) { if (!bcfg) return status::success; const auto _pd = pd(); @@ -639,8 +640,8 @@ status_t brgemm_convolution_fwd_t::add_po_kernel( return status::success; } -template -void brgemm_convolution_fwd_t::add_po_kernels( +template +void brgemm_convolution_fwd_t::add_po_kernels( int i_N, int init_bcast_dim, int po_bcast_dim) { const auto _pd = pd(); const auto &jcp = _pd->jcp_; @@ -674,10 +675,10 @@ void brgemm_convolution_fwd_t::add_po_kernels( } } } -template -int brgemm_convolution_fwd_t::get_comp_ker_idx( - const int kd_b, const int kd_e, const int kh_b, const int kh_e, - const int kw_b, const int kw_e) const { +template +int brgemm_convolution_fwd_t::get_comp_ker_idx(const int kd_b, + const int kd_e, const int kh_b, const int kh_e, const int kw_b, + const int kw_e) const { const auto _pd = pd(); const auto &jcp = _pd->jcp_; @@ -694,11 +695,10 @@ int brgemm_convolution_fwd_t::get_comp_ker_idx( return -1; } -template -inline int brgemm_convolution_fwd_t::get_comp_offset( - const int g, const int ocb, const int ow, const int kd_b, - const int kd_e, const int kh_b, const int kh_e, const int kw_b, - const int kw_e) const { +template +inline int brgemm_convolution_fwd_t::get_comp_offset(const int g, + const int ocb, const int ow, const int kd_b, const int kd_e, + const int kh_b, const int kh_e, const int kw_b, const int kw_e) const { const auto _pd = pd(); const auto &jcp = _pd->jcp_; @@ -712,8 +712,8 @@ inline int brgemm_convolution_fwd_t::get_comp_offset( : (g * jcp.nb_oc + ocb) * jcp.oc_block; } -template -status_t brgemm_convolution_fwd_t::init(engine_t 
*engine) { +template +status_t brgemm_convolution_fwd_t::init(engine_t *engine) { const auto _pd = pd(); const auto &jcp = _pd->jcp_; @@ -1052,8 +1052,8 @@ status_t brgemm_convolution_fwd_t::init(engine_t *engine) { return status::success; } -template -struct brgemm_convolution_fwd_t::brgemm_thread_ctx_t { +template +struct brgemm_convolution_fwd_t::brgemm_thread_ctx_t { brgemm_thread_ctx_t(brgemm_exec_ctx_t &brgemm_ctx_, int ithr_, brgemm_batch_element_t *__restrict brg_batch_, char *c_buffer_, char *wsp_tile_) @@ -1080,9 +1080,8 @@ struct brgemm_convolution_fwd_t::brgemm_thread_ctx_t { const float *dst_scales {nullptr}; }; -template -status_t brgemm_convolution_fwd_t::execute( - const exec_ctx_t &ctx) const { +template +status_t brgemm_convolution_fwd_t::execute(const exec_ctx_t &ctx) const { const auto _pd = pd(); const auto &jcp = _pd->jcp_; @@ -1264,8 +1263,8 @@ status_t brgemm_convolution_fwd_t::execute( return status::success; } -template -status_t brgemm_convolution_fwd_t::cal_compensation( +template +status_t brgemm_convolution_fwd_t::cal_compensation( const char *__restrict weights, int32_t *src_zp_buffer, int32_t *s8s8_comp_buffer) const { const auto _pd = pd(); @@ -1330,8 +1329,8 @@ status_t brgemm_convolution_fwd_t::cal_compensation( return status::success; } -template -void brgemm_convolution_fwd_t::perform_outwork( +template +void brgemm_convolution_fwd_t::perform_outwork( const brgemm_thread_ctx_t &btc, char *dst_base, const char *bias_w, int ow, int g_oc, bool is_oc_tail, int ker_ow_s, int ker_ow_f, int kd_l, int kh_l, bool maybe_do_init, bool do_postwork, @@ -1415,8 +1414,8 @@ void brgemm_convolution_fwd_t::perform_outwork( } } -template -inline void brgemm_convolution_fwd_t::call_brgemm_kernel( +template +inline void brgemm_convolution_fwd_t::call_brgemm_kernel( const brgemm_thread_ctx_t &btc, const brgemm_kernel_t *brg_ker, int batch_size, char *ptr_C, char *ptr_D, const char *bias_w, int g_oc, bool do_postops, int comp_ker_offs, bool do_only_comp) const { @@ -1465,8 +1464,8 @@ inline void brgemm_convolution_fwd_t::call_brgemm_kernel( ptr_C, static_cast(btc.wsp_tile)); } -template -void brgemm_convolution_fwd_t::maybe_conv_inp(int ithr, +template +void brgemm_convolution_fwd_t::maybe_conv_inp(int ithr, const char *__restrict src, char *__restrict inp_buffer, uint8_t *__restrict inp_buffer_mask, int g, int n, int icc, int odb, int ohb, int owb, int last_g, int last_n, int last_icc, int last_odb, @@ -1646,9 +1645,8 @@ void brgemm_convolution_fwd_t::maybe_conv_inp(int ithr, char *ptr_D; \ int kd_b(0), kd_e(0), kh_b(0), kh_e(0), k_l(0), iiw_b(0); -template -void brgemm_convolution_fwd_t::ker_base( - brgemm_thread_ctx_t &btc) const { +template +void brgemm_convolution_fwd_t::ker_base(brgemm_thread_ctx_t &btc) const { const auto _pd = pd(); const auto &jcp = _pd->jcp_; @@ -1797,8 +1795,8 @@ void brgemm_convolution_fwd_t::ker_base( } } -template -void brgemm_convolution_fwd_t::ker_trans( +template +void brgemm_convolution_fwd_t::ker_trans( brgemm_thread_ctx_t &btc, char *inp_buffer) const { const auto _pd = pd(); @@ -1922,9 +1920,8 @@ void brgemm_convolution_fwd_t::ker_trans( } } -template -void brgemm_convolution_fwd_t::ker_vpad( - brgemm_thread_ctx_t &btc) const { +template +void brgemm_convolution_fwd_t::ker_vpad(brgemm_thread_ctx_t &btc) const { const auto _pd = pd(); const auto &jcp = _pd->jcp_; diff --git a/src/cpu/aarch64/jit_brgemm_conv.hpp b/src/cpu/aarch64/jit_brgemm_conv.hpp index 2f476a2552a..dedcf753be2 100644 --- a/src/cpu/aarch64/jit_brgemm_conv.hpp +++ 
b/src/cpu/aarch64/jit_brgemm_conv.hpp @@ -1,6 +1,6 @@ /******************************************************************************* * Copyright 2021-2023 Intel Corporation -* Copyright 2024 FUJITSU LIMITED +* Copyright 2024-2025 FUJITSU LIMITED * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -41,18 +41,13 @@ namespace impl { namespace cpu { namespace aarch64 { -template +template struct brgemm_convolution_fwd_t : public primitive_t { struct brgemm_thread_ctx_t; struct pd_t : public cpu_convolution_fwd_pd_t { - pd_t(const convolution_desc_t *adesc, const primitive_attr_t *attr, - const typename pd_t::hint_class *hint_fwd_pd) - : cpu_convolution_fwd_pd_t(adesc, attr, hint_fwd_pd) - , with_sum(false) {} - - ~pd_t() = default; + using cpu_convolution_fwd_pd_t::cpu_convolution_fwd_pd_t; // ------- DECLARE_COMMON_PD_t ----- DECLARE_COMMON_PD_T(JIT_IMPL_NAME_HELPER("brgconv:", jcp_.isa, ""), @@ -63,8 +58,8 @@ struct brgemm_convolution_fwd_t : public primitive_t { int brgs_sz_; std::shared_ptr brgemm_descriptors_; - bool with_sum; - jit_brgemm_conv_conf_t jcp_; + bool with_sum = false; + jit_brgemm_conv_conf_t jcp_ = utils::zero(); int ic_chunks; bool need_postwork; @@ -122,7 +117,7 @@ struct brgemm_convolution_fwd_t : public primitive_t { } inline int maybe_invert(int k, int K) const { - return use_inversion ? K - 1 - k : k; + return desc()->use_inversion ? K - 1 - k : k; }; void init_batch(int icc, const char *src_base, const char *wei_base, @@ -149,12 +144,20 @@ struct brgemm_convolution_fwd_t : public primitive_t { } bool zero_points_ok() const { - // Only common zero points are supported -> mask should only be 0 - int mask_src = 0, mask_dst = 0; - attr()->zero_points_.get(DNNL_ARG_SRC, &mask_src); - attr()->zero_points_.get(DNNL_ARG_DST, &mask_dst); - return attr()->zero_points_.has_default_values(DNNL_ARG_WEIGHTS) - && mask_src == 0 && mask_dst == 0; + const auto &zp = attr()->zero_points_; + + if (!zp.has_default_values(DNNL_ARG_SRC)) { + int mask_src = zp.get_mask(DNNL_ARG_SRC); + const bool ok = mask_src == 0; + if (!ok) return false; + } + if (!zp.has_default_values(DNNL_ARG_DST)) { + int mask_dst = zp.get_mask(DNNL_ARG_DST); + const bool ok = mask_dst == 0; + if (!ok) return false; + } + + return zp.has_default_values(DNNL_ARG_WEIGHTS); } int KD, KH, KW, EXT_KD, EXT_KH, EXT_KW, KS, KD_BLOCK, KH_BLOCK, @@ -207,7 +210,7 @@ struct brgemm_convolution_fwd_t : public primitive_t { } inline int maybe_invert_range(int k, int k_inv, int K) const { - return use_inversion ? K - k_inv : k; + return pd()->desc()->use_inversion ? K - k_inv : k; }; void get_kw_range( diff --git a/src/cpu/aarch64/jit_brgemm_conv_bwd.cpp b/src/cpu/aarch64/jit_brgemm_conv_bwd.cpp new file mode 100644 index 00000000000..79210d804b0 --- /dev/null +++ b/src/cpu/aarch64/jit_brgemm_conv_bwd.cpp @@ -0,0 +1,185 @@ +/******************************************************************************* +* Copyright 2025 FUJITSU LIMITED +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#include "common/dnnl_thread.hpp" +#include "common/nstl.hpp" +#include "common/primitive_desc_iterator.hpp" +#include "common/type_helpers.hpp" +#include "common/utils.hpp" + +#include "cpu/aarch64/jit_brgemm_1x1_conv.hpp" +#include "cpu/aarch64/jit_brgemm_conv_bwd.hpp" +#include "cpu/cpu_convolution_pd.hpp" + +namespace dnnl { +namespace impl { +namespace cpu { +namespace aarch64 { + +namespace { +status_t weights_axes_permutation( + memory_desc_t *o_md, const memory_desc_t *i_md, bool with_groups) { + int perm[DNNL_MAX_NDIMS] {}; // bwd conv to fwd conv weight permutation + for (int d = 0; d < DNNL_MAX_NDIMS; ++d) + perm[d] = d; + nstl::swap(perm[0 + with_groups], perm[1 + with_groups]); + + return memory_desc_permute_axes(*o_md, *i_md, perm); +} + +status_t fwd_conv_desc_create( + convolution_desc_t *fwd_conv_d, const convolution_desc_t *bwd_conv_d) { + // create a new weights descriptor with OC and IC transposed; + // spatial inversion is handled by inverting indices on-the-fly + memory_desc_t fwd_weights_md; + const memory_desc_t &bwd_weights_md = bwd_conv_d->weights_desc; + const bool with_groups + = bwd_weights_md.ndims == bwd_conv_d->diff_src_desc.ndims + 1; + CHECK(weights_axes_permutation( + &fwd_weights_md, &bwd_weights_md, with_groups)); + + // create a fwd convolution descriptor with padding adjusted + // to the perspective of backward propagation, namely: + // - left padding replaced by left overflow + // - right padding replaced by right overflow + const int ndims_spatial = bwd_conv_d->diff_src_desc.ndims - 2; + dims_t overflow_l; + dims_t overflow_r; + dim_t ks = 1; + for (int i = 0; i < ndims_spatial; i++) { + VDISPATCH_CONV_IC(bwd_conv_d->strides[i] == 1, + VERBOSE_UNSUPPORTED_FEATURE, + "only unit strides are allowed for bwd-to-fwd conversion"); + const dim_t K + = bwd_weights_md.dims[bwd_weights_md.ndims - ndims_spatial + i]; + ks *= K; + const dim_t D = bwd_conv_d->dilates[i]; + const dim_t PL = bwd_conv_d->padding[0][i]; // left padding + const dim_t PR = bwd_conv_d->padding[1][i]; // right padding + constexpr dim_t S = 1; + // the following relations hold for unit stride only + overflow_l[i] = ((K - 1) * (D + 1) - PL) / S; + overflow_r[i] = ((K - 1) * (D + 1) - PR) / S; + } + CHECK(conv_desc_init(fwd_conv_d, prop_kind::forward_training, + alg_kind::convolution_direct, &bwd_conv_d->diff_dst_desc, + &fwd_weights_md, &bwd_conv_d->bias_desc, &bwd_conv_d->diff_src_desc, + bwd_conv_d->strides, bwd_conv_d->dilates, overflow_l, overflow_r)); + + // HACK: Set diff_src_desc and diff_dst_desc as a signal to the primitive + // descriptor cache that we are using the bwd-via-fwd version of + // fwd conv and thus need a separate cache entry. Only needed for + // non-1x1 convs due to spatial inversion of weights. This assumes + // that external users only use the API to create conv descs, and + // relies on common/convolution.cpp only setting the expected mem descs. + // TODO: Pass this information via attributes or integrate the bwd-via-fwd + // method directly into fwd conv implementations. + const bool with_spatial_inversion = ks > 1; + if (with_spatial_inversion) { + fwd_conv_d->diff_src_desc = fwd_conv_d->src_desc; + fwd_conv_d->diff_dst_desc = fwd_conv_d->dst_desc; + } + // Note: internal field to hint this conv is created from deconv. 
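+    // (On the fwd side this flag is read back as desc()->use_inversion in
+    // maybe_invert() / maybe_invert_range(), see jit_brgemm_conv.hpp, which
+    // invert the kernel indices, e.g. k -> K - 1 - k, instead of physically
+    // reordering the weight tensor.)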
+ fwd_conv_d->use_inversion = true; + return status::success; +} +} // namespace + +template +status_t brgemm_convolution_bwd_t::pd_t::init(engine_t *engine) { + using namespace data_type; + using namespace utils; + + VDISPATCH_CONV(is_bwd_d(), VERBOSE_BAD_PROPKIND); + VDISPATCH_CONV(set_default_alg_kind(alg_kind::convolution_direct), + VERBOSE_BAD_ALGORITHM); + VDISPATCH_CONV(!has_zero_dim_memory(), VERBOSE_EMPTY_TENSOR, ""); + VDISPATCH_CONV(attr()->has_default_values(), VERBOSE_UNSUPPORTED_ATTR); + + convolution_desc_t fwd_conv_d = convolution_desc_t(); + CHECK(fwd_conv_desc_create(&fwd_conv_d, desc())); + + primitive_desc_iterator_t it(engine, + reinterpret_cast(&fwd_conv_d), attr(), nullptr); + if (!it.is_initialized()) return status::out_of_memory; + + while (++it != it.end()) { + fwd_pd_ = *it; + using fwd_1x1_conv_pd_t = + typename brgemm_1x1_convolution_fwd_t::pd_t; + const auto pd_1x1 = dynamic_cast((*it).get()); + if (pd_1x1 != nullptr) { + break; // 1x1 implementation found + } + + using fwd_conv_pd_t = typename brgemm_convolution_fwd_t::pd_t; + + const auto pd = dynamic_cast((*it).get()); + if (pd != nullptr) { + break; // non-1x1 implementation found + } + } + + VDISPATCH_CONV(it != it.end(), "Implementation wasn't found"); + + if (weights_md_.format_kind == format_kind::any) + CHECK(weights_axes_permutation( + &weights_md_, fwd_pd_->weights_md(), with_groups())); + if (diff_src_md_.format_kind == format_kind::any) + diff_src_md_ = *fwd_pd_->dst_md(); + if (diff_dst_md_.format_kind == format_kind::any) + diff_dst_md_ = *fwd_pd_->src_md(); + if (bias_md_.format_kind == format_kind::any) + bias_md_ = *fwd_pd_->weights_md(1); + + init_name(); + auto scratchpad = scratchpad_registry().registrar(); + scratchpad.book( + memory_tracking::names::key_nested, fwd_pd_->scratchpad_registry()); + + return status::success; +} + +template +status_t brgemm_convolution_bwd_t::init(engine_t *engine) { + return pd()->fwd_pd_->create_primitive(fwd_p_, engine); +} + +template +status_t brgemm_convolution_bwd_t::execute(const exec_ctx_t &ctx) const { + const auto &args = ctx.args(); + exec_args_t conv_args; + conv_args[DNNL_ARG_DST] = args.at(DNNL_ARG_DIFF_SRC); + conv_args[DNNL_ARG_SRC] = args.at(DNNL_ARG_DIFF_DST); + conv_args[DNNL_ARG_WEIGHTS] = args.at(DNNL_ARG_WEIGHTS); + if (pd()->with_bias()) conv_args[DNNL_ARG_BIAS] = args.at(DNNL_ARG_BIAS); + + exec_ctx_t fwd_ctx(ctx, std::move(conv_args)); + + nested_scratchpad_t ns(ctx, memory_tracking::names::key_nested, fwd_p_); + fwd_ctx.set_scratchpad_grantor(ns.grantor()); + return fwd_p_->execute(fwd_ctx); +} + +template struct brgemm_convolution_bwd_t; +template struct brgemm_convolution_bwd_t; + +} // namespace aarch64 +} // namespace cpu +} // namespace impl +} // namespace dnnl + +// vim: et ts=4 sw=4 cindent cino+=l0,\:4,N-s diff --git a/src/cpu/aarch64/jit_brgemm_conv_bwd.hpp b/src/cpu/aarch64/jit_brgemm_conv_bwd.hpp new file mode 100644 index 00000000000..01498b291a6 --- /dev/null +++ b/src/cpu/aarch64/jit_brgemm_conv_bwd.hpp @@ -0,0 +1,76 @@ +/******************************************************************************* +* Copyright 2025 FUJITSU LIMITED +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. 
+* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#ifndef CPU_AARCH64_JIT_BRGEMM_CONV_BWD_HPP +#define CPU_AARCH64_JIT_BRGEMM_CONV_BWD_HPP + +#include "common/c_types_map.hpp" +#include "common/dnnl_thread.hpp" +#include "common/memory_tracking.hpp" +#include "common/primitive.hpp" +#include "common/utils.hpp" + +#include "cpu/aarch64/jit_brgemm_conv.hpp" + +namespace dnnl { +namespace impl { +namespace cpu { +namespace aarch64 { + +template +struct brgemm_convolution_bwd_t : public primitive_t { + + struct pd_t : public cpu_convolution_bwd_data_pd_t { + using cpu_convolution_bwd_data_pd_t::cpu_convolution_bwd_data_pd_t; + + DECLARE_COMMON_PD_T(name_.c_str(), brgemm_convolution_bwd_t); + + status_t init(engine_t *engine); + + std::shared_ptr fwd_pd_; + + private: + std::string name_ = JIT_IMPL_NAME_HELPER("brg_conv_bwd:", isa, ""); + + void init_name() { + name_.append("+"); + name_.append(fwd_pd_->name()); + } + }; + + brgemm_convolution_bwd_t(const pd_t *apd) : primitive_t(apd) {}; + + ~brgemm_convolution_bwd_t() override = default; + + status_t init(engine_t *engine) override; + + status_t execute(const exec_ctx_t &ctx) const override; + +private: + const pd_t *pd() const { + return static_cast(primitive_t::pd().get()); + } + std::shared_ptr fwd_p_; +}; + +} // namespace aarch64 +} // namespace cpu +} // namespace impl +} // namespace dnnl + +#endif + +// vim: et ts=4 sw=4 cindent cino+=l0,\:4,N-s diff --git a/src/cpu/aarch64/jit_brgemm_conv_comp_pad_kernel.hpp b/src/cpu/aarch64/jit_brgemm_conv_comp_pad_kernel.hpp index 0472aafb91a..96f86c2084a 100644 --- a/src/cpu/aarch64/jit_brgemm_conv_comp_pad_kernel.hpp +++ b/src/cpu/aarch64/jit_brgemm_conv_comp_pad_kernel.hpp @@ -1,6 +1,7 @@ /******************************************************************************* * Copyright 2022-2023 Intel Corporation * Copyright 2024 FUJITSU LIMITED +* Copyright 2024 Arm Ltd. and affiliates * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -43,15 +44,14 @@ struct jit_uni_brgemm_conv_comp_pad_kernel_t : public jit_generator { using XReg = const Xbyak_aarch64::XReg; - jit_uni_brgemm_conv_comp_pad_kernel_t( - const jit_brgemm_conv_conf_t &ajcp); + jit_uni_brgemm_conv_comp_pad_kernel_t(const jit_brgemm_conv_conf_t &ajcp); ~jit_uni_brgemm_conv_comp_pad_kernel_t() = default; protected: static constexpr bool is_ymm_ = true; - jit_brgemm_conv_conf_t jcp_; + jit_brgemm_conv_conf_t jcp_ = utils::zero(); const int inp_dsz_; const int out_dsz_; const size_t nb_ic_; diff --git a/src/cpu/aarch64/jit_brgemm_conv_utils.cpp b/src/cpu/aarch64/jit_brgemm_conv_utils.cpp index b93db5c423d..d10662b96ce 100644 --- a/src/cpu/aarch64/jit_brgemm_conv_utils.cpp +++ b/src/cpu/aarch64/jit_brgemm_conv_utils.cpp @@ -1,6 +1,7 @@ /******************************************************************************* * Copyright 2021-2023 Intel Corporation * Copyright 2024 FUJITSU LIMITED +* Copyright 2024 Arm Ltd. 
and affiliates * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -725,9 +726,9 @@ status_t brg_blocking_t::estimate_brgemm_ur() { const float alpha = 1.0; const float beta = 0.0; brgemm_t brg; - brgemm_utils::init_brgemm_conf(&brg, isa, brgemm_addr, src_dt, wei_dt, + CHECK(brgemm_utils::init_brgemm_conf(&brg, isa, brgemm_addr, src_dt, wei_dt, brgemm_row_major, alpha, beta, LDA, LDB, LDC, vM, vN, vK, nullptr, - is_bf32); + is_bf32)); CHECK(brgemm_utils::brgemm_blocking(&brg)); ur = brg.bd_block; ur_block = brg.bd_block; @@ -771,9 +772,9 @@ status_t brg_blocking_t::get_brgemm_ur( * rnd_up(oc, oc_block) * wei_dsz; const auto strides_ptr = (brg_type == brgemm_strd) ? &brg_strides : nullptr; - brgemm_utils::init_brgemm_conf(&brg, isa, brg_type, src_dt, - wei_dt, brgemm_row_major, alpha, vbeta, LDA, LDB, LDC, - vM, vN, vK, strides_ptr, is_bf32); + CHECK(brgemm_utils::init_brgemm_conf(&brg, isa, brg_type, + src_dt, wei_dt, brgemm_row_major, alpha, vbeta, LDA, + LDB, LDC, vM, vN, vK, strides_ptr, is_bf32)); CHECK(brgemm_utils::brgemm_blocking(&brg)); brgemm_attr_t brgattr; @@ -1758,19 +1759,23 @@ status_t init_jcp(jit_brgemm_conv_conf_t &jcp, cpu_isa_t isa, const int prelu_ind = p.find(primitive_kind::prelu); jcp.with_binary = !everyone_is(-1, binary_ind, prelu_ind); + const auto &zp = attr.zero_points_; jcp.src_zero_point = get_zp_type(attr, DNNL_ARG_SRC) != brgemm_broadcast_t::none; jcp.dst_zero_point = get_zp_type(attr, DNNL_ARG_DST) != brgemm_broadcast_t::none; - const bool has_zero_points = jcp.src_zero_point || jcp.dst_zero_point; - const bool params_ok - = IMPLICATION(has_zero_points, utils::one_of(jcp.src_dt, u8, s8)) - && IMPLICATION( - jcp.src_zero_point, attr.zero_points_.common(DNNL_ARG_SRC)) - && IMPLICATION( - jcp.dst_zero_point, attr.zero_points_.common(DNNL_ARG_DST)); - if (!params_ok) return status::unimplemented; + VDISPATCH_CONV_IC(IMPLICATION(jcp.src_zero_point || jcp.dst_zero_point, + utils::one_of(jcp.src_dt, s8, u8)), + VERBOSE_UNSUPPORTED_ZP_CFG); + + VDISPATCH_CONV_IC( + IMPLICATION(jcp.src_zero_point, zp.get_mask(DNNL_ARG_SRC) == 0), + VERBOSE_UNSUPPORTED_ZP_CFG); + + VDISPATCH_CONV_IC( + IMPLICATION(jcp.dst_zero_point, zp.get_mask(DNNL_ARG_DST) == 0), + VERBOSE_UNSUPPORTED_ZP_CFG); jcp.nthr = nthreads; jcp.kh_sets = 1; @@ -1992,7 +1997,7 @@ status_t init_conf(jit_brgemm_conv_conf_t &jcp, cpu_isa_t isa, jcp.with_scales = !src_scales.has_default_values() || !wei_scales.has_default_values() || jcp.scale_adjust_factor != 1.0f; - jcp.is_oc_scale = wei_scales.mask_ != 0; + jcp.is_oc_scale = wei_scales.get_mask() > 0; // disables the shape with small ic but large spatial // or specific large spatial shapes for int8 conv @@ -2189,7 +2194,7 @@ status_t init_1x1_conf(jit_brgemm_conv_conf_t &jcp, cpu_isa_t isa, jcp.with_scales = !src_scales.has_default_values() || !wei_scales.has_default_values() || jcp.scale_adjust_factor != 1.0f; - jcp.is_oc_scale = wei_scales.mask_ != 0; + jcp.is_oc_scale = wei_scales.get_mask() > 0; // enable ununroll_bd_loop for big shapes to reduce kernel sizes jcp.ununroll_bd_loop diff --git a/src/cpu/aarch64/jit_brgemm_post_ops.hpp b/src/cpu/aarch64/jit_brgemm_post_ops.hpp index 2809e1813b6..5aed828a582 100644 --- a/src/cpu/aarch64/jit_brgemm_post_ops.hpp +++ b/src/cpu/aarch64/jit_brgemm_post_ops.hpp @@ -1,6 +1,7 @@ /******************************************************************************* * Copyright 2020-2023 Intel Corporation * Copyright 2024 FUJITSU LIMITED +* 
Copyright 2024 Arm Ltd. and affiliates * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -196,7 +197,7 @@ struct jit_brgemm_kernel_diff_bias_t : public jit_generator { } void generate() override { - size_t simd_w_; + size_t simd_w_ = 0; switch (brg_.isa_impl) { case sve_512: simd_w_ = cpu_isa_traits::vlen / sizeof(float); @@ -204,7 +205,10 @@ struct jit_brgemm_kernel_diff_bias_t : public jit_generator { case sve_256: simd_w_ = cpu_isa_traits::vlen / sizeof(float); break; - default: assert(!"unsupported isa"); + default: { + assert(!"unsupported isa"); + return; + } } preamble(); if (simd_w_ != cpu_sveLen / sizeof(float)) { @@ -321,8 +325,8 @@ struct jit_brgemm_kernel_post_ops : public jit_generator { const auto &wei_scales = attr.scales_.get(DNNL_ARG_WEIGHTS); // per_oc: conv: 1 << 0, (1 << 1) + (1 << 0) (with groups) // per_oc: ip: 1 << 0 - is_oc_scale_ - = utils::one_of(wei_scales.mask_, 1 << 0, (1 << 1) + (1 << 0)); + is_oc_scale_ = utils::one_of( + wei_scales.get_mask(), 1 << 0, (1 << 1) + (1 << 0)); LDD_ = brg.LDD; inp_dt_ = brg.dt_c; @@ -850,7 +854,7 @@ struct jit_brgemm_kernel_post_ops : public jit_generator { } void generate() override { - size_t simd_w_; + size_t simd_w_ = 0; switch (brg.isa_impl) { case sve_512: simd_w_ = cpu_isa_traits::vlen / sizeof(float); @@ -858,7 +862,10 @@ struct jit_brgemm_kernel_post_ops : public jit_generator { case sve_256: simd_w_ = cpu_isa_traits::vlen / sizeof(float); break; - default: assert(!"unsupported isa"); + default: { + assert(!"unsupported isa"); + return; + } } preamble(); if (simd_w_ != cpu_sveLen / sizeof(float)) { diff --git a/src/cpu/aarch64/jit_primitive_conf.hpp b/src/cpu/aarch64/jit_primitive_conf.hpp index ef223f20aab..22af4a66fa9 100644 --- a/src/cpu/aarch64/jit_primitive_conf.hpp +++ b/src/cpu/aarch64/jit_primitive_conf.hpp @@ -36,6 +36,7 @@ enum conv_version_t { ver_unused, ver_fma, ver_sve_512, + ver_sve_256, }; enum conv_loop_order_t { diff --git a/src/cpu/aarch64/jit_sve_1x1_conv_kernel.cpp b/src/cpu/aarch64/jit_sve_1x1_conv_kernel.cpp new file mode 100644 index 00000000000..5bb7da464ea --- /dev/null +++ b/src/cpu/aarch64/jit_sve_1x1_conv_kernel.cpp @@ -0,0 +1,1398 @@ +/******************************************************************************* +* Copyright 2021-2023 Intel Corporation +* Copyright 2021-2024 FUJITSU LIMITED +* Copyright 2024-2025 Arm Ltd. and affiliates +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ +#include +#include + +#include "common/c_types_map.hpp" +#include "common/dnnl_thread.hpp" +#include "common/memory.hpp" +#include "common/memory_tracking.hpp" +#include "common/nstl.hpp" +#include "common/type_helpers.hpp" +#include "common/utils.hpp" + +#include "cpu/aarch64/cpu_barrier.hpp" +#include "cpu/platform.hpp" + +#include "cpu/aarch64/injectors/injector_utils.hpp" +#include "cpu/aarch64/injectors/jit_uni_binary_injector.hpp" +#include "cpu/aarch64/injectors/jit_uni_eltwise_injector.hpp" +#include "cpu/aarch64/jit_sve_1x1_conv_kernel.hpp" +#include "cpu/aarch64/jit_uni_1x1_conv_utils.hpp" + +#define GET_OFF(field) \ + static_cast(offsetof(jit_1x1_conv_call_s, field)) + +namespace dnnl { +namespace impl { +namespace cpu { +namespace aarch64 { + +using namespace dnnl::impl::format_tag; +using namespace dnnl::impl::prop_kind; +using namespace dnnl::impl::utils; + +template +jit_sve_1x1_conv_kernel::jit_sve_1x1_conv_kernel( + const jit_1x1_conv_conf_t &ajcp, const primitive_attr_t &attr, + const memory_desc_t &dst_md) + : jcp(ajcp), attr_(attr) { + if (jcp.with_eltwise || jcp.with_binary) { + using namespace binary_injector; + static constexpr bool preserve_gpr = true; + static constexpr bool preserve_vmm = false; + static constexpr size_t helper_vmm_idx = 31; + const size_t tail_size = jcp.oc_without_padding % isa_simd_width_; + static constexpr bool use_exact_tail_scalar_bcast = true; + + const rhs_arg_static_params_t rhs_arg_static_params {helper_vmm_idx, + x14, x15, x13, preserve_gpr, preserve_vmm, + GET_OFF(post_ops_binary_rhs_arg_vec), GET_OFF(dst_orig), + memory_desc_wrapper(dst_md), tail_size, k_load_dim_mask, + use_exact_tail_scalar_bcast}; + const static_params_t static_params { + this->param1, rhs_arg_static_params}; + + postops_injector_ = utils::make_unique< + injector::jit_uni_postops_injector_t>( + this, jcp.post_ops, static_params); + } +} + +template +void jit_sve_1x1_conv_kernel::bcast_loop(int load_loop_blk) { + + mov(aux1_reg_bcast_data, reg_bcast_data); + mov(aux_reg_bcast_data, reg_bcast_data); + mov(aux_reg_output_data, reg_output_data); + ldr(reg_bcast_loop_iter, ptr(X_SP, reg_bcast_loop_work_offt)); + + Label bcast_loop; + Label bcast_loop_tail; + Label large_tail; + + cmp_imm(reg_bcast_loop_iter, jcp.bcast_block, reg_tmp_imm); + b(LT, bcast_loop_tail); + + L(bcast_loop); + { + assert(jcp.bcast_block % jcp.ur == 0); + int num_substeps = jcp.bcast_block / jcp.ur; + assert(num_substeps > 0 && num_substeps < 10); + for (int i = 0; i < num_substeps; i++) { + if (i + 1 == num_substeps) L(large_tail); + reduce_loop(load_loop_blk, jcp.ur, i, false); + if (i < num_substeps - 1) { + add_imm(aux1_reg_bcast_data, aux1_reg_bcast_data, + jcp.bcast_loop_bcast_substep, reg_tmp_imm); + add_imm(aux_reg_output_data, aux_reg_output_data, + jcp.bcast_loop_output_substep, reg_tmp_imm); + } else { + add_imm(aux1_reg_bcast_data, aux1_reg_bcast_data, + jcp.bcast_loop_bcast_step + - (num_substeps - 1) + * jcp.bcast_loop_bcast_substep, + reg_tmp_imm); + add_imm(aux_reg_output_data, aux_reg_output_data, + jcp.bcast_loop_output_step + - (num_substeps - 1) + * jcp.bcast_loop_output_substep, + reg_tmp_imm); + } + subs_imm(reg_bcast_loop_iter, reg_bcast_loop_iter, jcp.ur, + reg_tmp_imm); + } + cmp_imm(reg_bcast_loop_iter, jcp.bcast_block, reg_tmp_imm); + b(GE, bcast_loop); + } + + L(bcast_loop_tail); + if (jcp.ur_tail) { + Label bcast_loop_tail_out; + if (jcp.ur_tail >= jcp.ur) { + 
cmp_imm(reg_bcast_loop_iter, jcp.ur, reg_tmp_imm); + b(GE, large_tail); + } + if (jcp.ur_tail % jcp.ur) { + cmp(reg_bcast_loop_iter, 0); + b(LE, bcast_loop_tail_out); + reduce_loop(load_loop_blk, jcp.ur_tail % jcp.ur, 0, true); + L(bcast_loop_tail_out); + } + } +} + +template +Xbyak_aarch64::XReg jit_sve_1x1_conv_kernel::output_ptr( + const bool is_out_layout_nxc, const int i_load, const int i_ur, + Xbyak_aarch64::XReg addr) { + if (one_of(jcp.prop_kind, forward_training, forward_inference, + backward_data)) { + int i_load_shift = is_out_layout_nxc + ? jcp.load_block + : (jcp.with_dw_conv ? jcp.ow : jcp.bcast_dim) * jcp.load_block; + int i_ur_shift = is_out_layout_nxc ? jcp.load_dim : jcp.load_block; + int offset = (i_load * i_load_shift + i_ur * i_ur_shift) + * jcp.typesize_out; + EVEX_compress_addr(addr, X_TMP_0, aux_reg_output_data, offset); + } else { + int offset = jcp.typesize_out * jcp.load_block * i_ur; + mov(X_TMP_0, i_load); + mul(X_TMP_0, reg_output_stride, X_TMP_0); + add_imm(X_TMP_1, X_TMP_0, offset, X_TMP_2); + add(addr, aux_reg_output_data, X_TMP_1); + } + return addr; +} + +static int vreg_accum_idx( + const int load_loop_blk, const int i_load, const int i_ur) { + return (i_ur * load_loop_blk + i_load); +} + +template +static void iterate(const int load_loop_blk, const int ur, const bool mask_tail, + const F &fun) { + for (int i_load = 0; i_load < load_loop_blk; ++i_load) { + const bool mask_flag = mask_tail && i_load + 1 == load_loop_blk; + for (int i_ur = 0; i_ur < ur; ++i_ur) + fun(mask_flag, i_load, i_ur); + } +} +template +static void iterate(const int load_loop_blk, const int ur, const F &fun) { + iterate(load_loop_blk, ur, false, fun); +} + +template +void jit_sve_1x1_conv_kernel::apply_postops( + const bool is_out_layout_nxc, const int load_loop_blk, const int ur) { + injector_utils::vmm_index_set_t vmm_idxs; + if (jcp.with_binary) { + binary_injector::rhs_arg_dynamic_params_t rhs_arg_params; + const auto mask_tail = jcp.oc_without_padding % jcp.load_block; + iterate(load_loop_blk, ur, mask_tail, + [&](const bool mask_flag, const int i_load, const int i_ur) { + const auto vmm_idx + = vreg_accum_idx(load_loop_blk, i_load, i_ur); + vmm_idxs.emplace(vmm_idx); + + rhs_arg_params.vmm_idx_to_out_reg.emplace( + vmm_idx, aux_reg_output_data); + rhs_arg_params.vmm_idx_to_out_elem_off_val.emplace(vmm_idx, + get_output_offset(is_out_layout_nxc, i_load, i_ur)); + if (mask_flag) + rhs_arg_params.vmm_tail_idx_.emplace(vmm_idx); + }); + + ldr(abi_param1, ptr(X_SP, reg_abi_param1_backup)); + + postops_injector_->compute_vector_range(vmm_idxs, rhs_arg_params); + } else { + iterate(load_loop_blk, ur, + [&](const bool, const int i_load, const int i_ur) { + vmm_idxs.emplace( + vreg_accum_idx(load_loop_blk, i_load, i_ur)); + }); + postops_injector_->compute_vector_range(vmm_idxs); + } +} + +template +void jit_sve_1x1_conv_kernel::reduce_loop( + int load_loop_blk, int ur, int substep, bool wraparound) { + + const bool out_layout_nxc = is_out_layout_nxc(jcp); + const bool load_layout_nxc = is_load_layout_nxc(jcp); + const bool bcast_layout_nxc = is_bcast_layout_nxc(jcp); + const int reduce_dim_tail = jcp.reduce_dim % jcp.reduce_block; + const int load_dim_tail = jcp.load_dim % jcp.load_block; + + auto vreg_load + = [=](int i_load) { return ZReg(ur * load_loop_blk + i_load); }; + + auto vreg_accum = [=](int i_load, int i_ur) { + return ZReg(vreg_accum_idx(load_loop_blk, i_load, i_ur)); + }; + + auto bias_ptr = [=](int i_load) { + return EVEX_compress_addr(X_DEFAULT_ADDR, X_TMP_0, 
reg_bias_data, + jcp.typesize_out * jcp.oc_block * i_load); + }; + + auto bcast_ptr = [=](int i_reduce, int i_ur, bool bcast, + const Xbyak_aarch64::XReg addr, + const Xbyak_aarch64::XReg tmp) { + assert(i_ur < jcp.ur); + assert(i_reduce <= jcp.reduce_loop_unroll); + int offt; + if (one_of(jcp.prop_kind, forward_training, forward_inference, + backward_data)) { + assert(jcp.reduce_loop_unroll == jcp.reduce_block); + const int reduce_mul = bcast_layout_nxc ? jcp.reduce_dim + : jcp.reduce_loop_unroll; + offt = (i_reduce == jcp.reduce_loop_unroll) + ? (jcp.bcast_dim + i_ur) * reduce_mul + : i_ur * reduce_mul + i_reduce; + } else { + int rmul = bcast_layout_nxc ? jcp.ic : jcp.ic_block; + offt = i_reduce * rmul + i_ur; + } + return EVEX_compress_addr( + addr, tmp, aux_reg_bcast_data, jcp.typesize_in * offt, bcast); + }; + + auto load_ptr = [=](int i_reduce, int i_load, + const Xbyak_aarch64::XReg addr, + const Xbyak_aarch64::XReg tmp) { + int offt; + int u0 = i_reduce % jcp.reduce_loop_unroll; + int u1 = i_reduce / jcp.reduce_loop_unroll; + int lmul = jcp.load_block + * (load_layout_nxc ? 1 + : utils::rnd_up( + jcp.reduce_dim, jcp.reduce_block)); + int rmul = load_layout_nxc ? jcp.load_dim : jcp.load_block; + offt = i_load * lmul + u0 * rmul; + return EVEX_compress_addr(addr, tmp, aux_reg_load_data, + u1 * jcp.reduce_loop_load_step + jcp.typesize_in * offt); + }; + + auto init = [=]() { + Label init_done; + Label init_zero; + + if (jcp.with_bias + && one_of(jcp.prop_kind, forward_training, forward_inference)) { + tst(reg_reduce_pos_flag, FLAG_REDUCE_FIRST); + b(EQ, init_zero); + + for (int i_load = 0; i_load < load_loop_blk; i_load++) + for (int i_ur = 0; i_ur < ur; ++i_ur) { + auto vreg_acc = vreg_accum(i_load, i_ur); + if (i_load + 1 == load_loop_blk && load_dim_tail) + ld1w(vreg_acc.s, k_load_dim_mask / T_z, + ptr(bias_ptr(i_load))); + else + ld1w(vreg_acc.s, P_ALL_ONE / T_z, + ptr(bias_ptr(i_load))); + } + b(init_done); + } + + L(init_zero); + + /* Zero clear */ + for (int i_load = 0; i_load < load_loop_blk; ++i_load) + for (int i_ur = 0; i_ur < ur; ++i_ur) { + auto r = vreg_accum(i_load, i_ur); + eor(r.d, r.d, r.d); + } + L(init_done); + }; + + auto store = [=]() { + Label store_noadd; + if (!jcp.with_sum) { + tst(reg_reduce_pos_flag, FLAG_REDUCE_FIRST); + b(NE, store_noadd); + } + + for (int i_ur = 0; i_ur < ur; ++i_ur) + for (int i_load = 0; i_load < load_loop_blk; ++i_load) { + auto r = vreg_accum(i_load, i_ur).s; + if (i_load + 1 == load_loop_blk && load_dim_tail) + ld1w(zreg_tmp.s, k_load_dim_mask / T_z, + ptr(output_ptr(out_layout_nxc, i_load, i_ur, + X_DEFAULT_ADDR))); + else + ld1w(zreg_tmp.s, P_ALL_ONE / T_z, + ptr(output_ptr(out_layout_nxc, i_load, i_ur, + X_DEFAULT_ADDR))); + fadd(r, r, zreg_tmp.s); + } + + L(store_noadd); + if (jcp.with_eltwise || jcp.with_binary) { + Label store_nopostops; + tst(reg_reduce_pos_flag, FLAG_REDUCE_LAST); + b(EQ, store_nopostops); + + apply_postops(out_layout_nxc, load_loop_blk, ur); + + L(store_nopostops); + } + + auto store_output = [=](bool output_is_aligned) { + const auto mask_flag = load_dim_tail; + for (int i_ur = 0; i_ur < ur; ++i_ur) { + for (int i_load = 0; i_load < load_loop_blk; ++i_load) { + auto vreg_acc = vreg_accum(i_load, i_ur); + // for nxc_layout-bwd_w, weights are still padded and the + // output_ptr here can be uninitialized scratchpad. + // To ensure final output (after reduction) is zero-padded, + // here we zero-pad output by omitting the mask. 
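+                // (Hence the branch below: the tail-masked st1w is taken
+                // only for non-bwd_w propagation kinds; bwd_w always uses
+                // the P_ALL_ONE store and writes the full vector, padded
+                // lanes included.)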
+ if (jcp.prop_kind != backward_weights + && (i_load + 1 == load_loop_blk && mask_flag)) { + st1w(vreg_acc.s, k_load_dim_mask / T_z, + ptr(output_ptr(out_layout_nxc, i_load, i_ur, + X_DEFAULT_ADDR))); + } else { + st1w(vreg_acc.s, P_ALL_ONE / T_z, + ptr(output_ptr(out_layout_nxc, i_load, i_ur, + X_DEFAULT_ADDR))); + } + } + } + }; + + Label unaligned_store, end_store; + tst(aux_reg_output_data, cpu_isa_traits::vlen - 1); + b(NE, unaligned_store); + store_output(true); + b(end_store); + L(unaligned_store); + { store_output(false); } + L(end_store); + }; + + auto fma_block = [=](bool last_block) { + const int i_reduce_end = reduce_dim_tail && last_block + ? reduce_dim_tail + : jcp.reduce_loop_unroll; + + for (int i_reduce = 0; i_reduce < i_reduce_end; i_reduce++) { + for (int i_load = 0; i_load < load_loop_blk; ++i_load) { + auto vreg = vreg_load(i_load); + if (i_load + 1 == load_loop_blk && load_dim_tail) + ld1w(vreg.s, k_load_dim_mask / T_z, + ptr(load_ptr(i_reduce, i_load, X_DEFAULT_ADDR, + X_TMP_0))); + else + ld1w(vreg.s, P_ALL_ONE / T_z, + ptr(load_ptr(i_reduce, i_load, X_DEFAULT_ADDR, + X_TMP_0))); + } + + for (int i_ur = 0; i_ur < ur; ++i_ur) { + if (jcp.expl_bcast && load_loop_blk > 1) { + ldr(W_TMP_0, + ptr(bcast_ptr(i_reduce, i_ur, false, X_DEFAULT_ADDR, + X_TMP_1))); + dup(vreg_bcast.s, W_TMP_0); + } + for (int i_load = 0; i_load < load_loop_blk; ++i_load) { + auto vreg_acc = vreg_accum(i_load, i_ur); + if (i_load + 1 == load_loop_blk && load_dim_tail) { + ld1rw(zreg_tmp.s, P_ALL_ONE, + ptr(bcast_ptr(i_reduce, i_ur, true, + X_DEFAULT_ADDR, X_TMP_0))); + fmla(vreg_acc.s, k_load_dim_mask / T_m, + vreg_load(i_load).s, zreg_tmp.s); + } else if (jcp.expl_bcast && load_loop_blk > 1) { + fmla(vreg_acc.s, P_ALL_ONE / T_m, vreg_load(i_load).s, + vreg_bcast.s); + } else { + ld1rw(zreg_tmp.s, P_ALL_ONE, + ptr(bcast_ptr(i_reduce, i_ur, true, + X_DEFAULT_ADDR, X_TMP_0))); + fmla(vreg_acc.s, P_ALL_ONE / T_m, vreg_load(i_load).s, + zreg_tmp.s); + } + } + } + } + }; + + Label reduce_loop; + Label reduce_loop_tail; + + mov(aux_reg_load_data, reg_load_data); + + mov(aux_reg_bcast_data, aux1_reg_bcast_data); + init(); + + mov(reduce_loop_iter, reg_reduce_loop_work); + subs_imm(reduce_loop_iter, reduce_loop_iter, jcp.reduce_loop_unroll, + reg_tmp_imm); + b(LE, reduce_loop_tail); + + L(reduce_loop); + { + fma_block(false); + add_imm(aux_reg_bcast_data, aux_reg_bcast_data, + jcp.reduce_loop_bcast_step, reg_tmp_imm); + add_imm(aux_reg_load_data, aux_reg_load_data, jcp.reduce_loop_load_step, + reg_tmp_imm); + subs_imm(reduce_loop_iter, reduce_loop_iter, jcp.reduce_loop_unroll, + reg_tmp_imm); + b(GT, reduce_loop); + } + + L(reduce_loop_tail); + fma_block(true); + + store(); +} + +template +void jit_sve_1x1_conv_kernel::generate() { + preamble(); + + sub_imm(X_SP, X_SP, stack_space_needed, X_TMP_0); + if (jcp.with_binary) { + const auto zeroed_reg = x15; + eor(zeroed_reg, zeroed_reg, zeroed_reg); + str(zeroed_reg, ptr(X_SP, reg_binary_post_op_acc_off)); + str(param1, ptr(X_SP, reg_abi_param1_backup)); + } + + /* Pointers indicate weight, input, and output data */ + ldr(reg_bcast_data, ptr(abi_param1, GET_OFF(bcast_data))); // Input + ldr(reg_load_data, ptr(abi_param1, GET_OFF(load_data))); // Weight + ldr(reg_output_data, ptr(abi_param1, GET_OFF(output_data))); // Output + + /* Pointer indicates bias data if the layer has bias option */ + if (jcp.with_bias) ldr(reg_bias_data, ptr(abi_param1, GET_OFF(bias_data))); + + /* Get workloads of each loop */ + ldr(reg_load_loop_work, ptr(abi_param1, 
GET_OFF(load_dim))); + ldr(reg_bcast_loop_work, ptr(abi_param1, GET_OFF(bcast_dim))); + str(reg_bcast_loop_work, ptr(X_SP, reg_bcast_loop_work_offt)); + ldr(reg_reduce_loop_work, ptr(abi_param1, GET_OFF(reduce_dim))); + + /* A flag for controlling reduce loop */ + ldr(reg_reduce_pos_flag, ptr(abi_param1, GET_OFF(first_last_flag))); + if (jcp.prop_kind == backward_weights) + ldr(reg_output_stride, ptr(param1, GET_OFF(output_stride))); + + const int load_dim_tail + = (one_of(jcp.prop_kind, forward_training, forward_inference) + ? jcp.oc_without_padding + : jcp.load_dim) + % jcp.load_block; + if (load_dim_tail) { + const WReg w_tmp(reg_load_dim_tail_mask.getIdx()); + mov_imm(w_tmp, (1 << load_dim_tail) - 1); + st1w(zreg_tmp1.s, P_ALL_ONE / T_z, ptr(X_TRANSLATOR_STACK, -1, MUL_VL)); + index(zreg_tmp.s, 0, 1); + mov(zreg_tmp1.s, 1); + lsl(zreg_tmp1.s, P_ALL_ONE / T_m, zreg_tmp.s); + dup(zreg_tmp.s, w_tmp); + and_(zreg_tmp.d, zreg_tmp.d, zreg_tmp1.d); + cmpne(k_load_dim_tail_mask.s, P_ALL_ONE, zreg_tmp.s, 0); + ldr(zreg_tmp1, ptr(X_TRANSLATOR_STACK, -1, MUL_VL)); + } + + auto load_loop_body = [=](int load_loop_blk) { + if (load_dim_tail) { + eor(k_load_dim_mask.b, P_ALL_ONE / T_z, k_load_dim_mask.b, + k_load_dim_mask.b); + not_(k_load_dim_mask.b, P_ALL_ONE / T_z, k_load_dim_mask.b); + } + subs_imm(reg_load_loop_work, reg_load_loop_work, + load_loop_blk * jcp.load_loop_iter_step, reg_tmp_imm); + if (load_dim_tail) { + Label no_update_mask; + b(GE, no_update_mask); + mov(k_load_dim_mask.b, k_load_dim_tail_mask.b); + L(no_update_mask); + } + bcast_loop(load_loop_blk); + add_imm(reg_load_data, reg_load_data, + load_loop_blk * jcp.load_loop_load_step, reg_tmp_imm); + switch (jcp.prop_kind) { + case forward_training: + case forward_inference: + add_imm(reg_bias_data, reg_bias_data, + load_loop_blk * jcp.load_block * jcp.typesize_out, + reg_tmp_imm); + add_imm(reg_output_data, reg_output_data, + load_loop_blk * jcp.load_block * jcp.typesize_out + * (is_out_layout_nxc(jcp) + ? 1 + : (jcp.with_dw_conv + ? jcp.ow + : jcp.bcast_dim)), + reg_tmp_imm); + if (jcp.with_binary) { + const auto oc_off_oprnd = aux_reg_load_data; + ldr(oc_off_oprnd, ptr(X_SP, reg_binary_post_op_acc_off)); + add_imm(oc_off_oprnd, oc_off_oprnd, + jcp.load_block * load_loop_blk, X_TMP_0); + str(oc_off_oprnd, ptr(X_SP, reg_binary_post_op_acc_off)); + } + break; + case backward_data: + add_imm(reg_output_data, reg_output_data, + load_loop_blk * jcp.load_block * jcp.typesize_out + * (is_out_layout_nxc(jcp) ? 1 : jcp.bcast_dim), + reg_tmp_imm); + break; + case backward_weights: + for (int i_load = 0; i_load < load_loop_blk; i_load++) + add(reg_output_data, reg_output_data, reg_output_stride); + break; + default: assert(!"invalid prop_kind"); + } + }; + + const int simd_w = cpu_isa_traits::vlen / sizeof(float); + + Label load_loop_blk[7]; + + // with an implicit load_loop_block {6, 5, 4, 3, 2, 1} + static const int ur_cases_fma_embd_bcast[] = {2, 4, 5, 8, 14, 32}; + static const int ur_cases_fma_expl_bcast[] = {2, 5, 6, 9, 14, 32}; + + const int size_ur_cases_fma = jcp.expl_bcast + ? sizeof(ur_cases_fma_expl_bcast) + : sizeof(ur_cases_fma_embd_bcast); + + const int *ur_cases_fma = jcp.expl_bcast ? 
ur_cases_fma_expl_bcast + : ur_cases_fma_embd_bcast; + const int *ur_cases = ur_cases_fma; + const int num_ur_cases = size_ur_cases_fma / sizeof(*ur_cases); + + for (int ur_idx = num_ur_cases - 1; ur_idx > 0; ur_idx--) { + int label_idx = num_ur_cases - ur_idx - 1; + if (jcp.nb_load > label_idx && jcp.ur <= ur_cases[ur_idx]) { + cmp_imm(reg_load_loop_work, simd_w * (label_idx + 1), reg_tmp_imm); + b(LE, load_loop_blk[label_idx]); + } + } + + for (int ur_idx = 0; ur_idx < num_ur_cases; ur_idx++) { + int label_idx = num_ur_cases - ur_idx - 1; + if (jcp.nb_load > label_idx && jcp.ur <= ur_cases[ur_idx]) { + L(load_loop_blk[label_idx]); + { + if (label_idx == 0) { + cmp(reg_load_loop_work, 0); + b(LE, load_loop_blk[num_ur_cases]); + } + load_loop_body(label_idx + 1); + if (label_idx - 1 > 0) { + cmp_imm(reg_load_loop_work, 2 * label_idx * simd_w, + reg_tmp_imm); + b(EQ, load_loop_blk[label_idx - 1]); + } + cmp_imm(reg_load_loop_work, label_idx * simd_w, reg_tmp_imm); + b(GT, load_loop_blk[label_idx]); + } + for (int idx = label_idx - 1; idx >= 0; --idx) { + cmp_imm(reg_load_loop_work, simd_w * (idx + 1), reg_tmp_imm); + b(GE, load_loop_blk[idx]); + } + if (ur_idx < num_ur_cases - 2) { + cmp_imm(reg_load_loop_work, simd_w, reg_tmp_imm); + b(LE, load_loop_blk[0]); + } + } + } + L(load_loop_blk[num_ur_cases]); + + add_imm(X_SP, X_SP, stack_space_needed, X_TMP_0); + + postamble(); + if (jcp.with_eltwise) postops_injector_->prepare_table(); +} + +template +status_t jit_sve_1x1_conv_kernel::init_conf(jit_1x1_conv_conf_t &jcp, + const convolution_desc_t &cd, const memory_desc_wrapper &src_d, + const memory_desc_wrapper &weights_d, const memory_desc_wrapper &dst_d, + const primitive_attr_t &attr, int nthreads, bool reduce_src) { + + /* arch check */ + if (!mayiuse(isa_)) { return status::unimplemented; } + jcp.isa = isa_; + + if (!everyone_is(data_type::f32, src_d.data_type(), weights_d.data_type(), + dst_d.data_type())) { + return status::unimplemented; + } + + jcp.nthr = nthreads; + + const bool with_groups = weights_d.ndims() == src_d.ndims() + 1; + const int simd_w = cpu_isa_traits::vlen / sizeof(float); + const int ndims = src_d.ndims(); + /* Forward_[training, inference], backward_[data, weight] */ + jcp.prop_kind = cd.prop_kind; + + /* Check group option */ + jcp.ngroups = with_groups ? weights_d.dims()[0] : 1; + /* Batchsize */ + jcp.mb = src_d.dims()[0]; + /* Channel */ + jcp.oc_without_padding = dst_d.dims()[1] / jcp.ngroups; + jcp.oc = jcp.oc_without_padding; + jcp.ic_without_padding = src_d.dims()[1] / jcp.ngroups; + jcp.ic = jcp.ic_without_padding; + /* D, H, W */ + jcp.id = (ndims == 5) ? src_d.dims()[2] : 1; + jcp.ih = (ndims == 3) ? 1 : src_d.dims()[ndims - 2]; + jcp.iw = src_d.dims()[ndims - 1]; + jcp.od = (ndims == 5) ? dst_d.dims()[2] : 1; + jcp.oh = (ndims == 3) ? 1 : dst_d.dims()[ndims - 2]; + jcp.ow = dst_d.dims()[ndims - 1]; + /* Kernel size */ + jcp.kd = (ndims == 5) ? weights_d.dims()[with_groups + 2] : 1; + jcp.kh = (ndims == 3) ? 1 : weights_d.dims()[with_groups + ndims - 2]; + jcp.kw = weights_d.dims()[with_groups + ndims - 1]; + /* padding params */ + jcp.f_pad = (ndims == 5) ? cd.padding[0][0] : 0; + jcp.t_pad = (ndims == 3) ? 0 : cd.padding[0][ndims - 4]; + jcp.l_pad = cd.padding[0][ndims - 3]; + /* stride params */ + jcp.stride_d = (ndims == 5) ? cd.strides[0] : 1; + jcp.stride_h = (ndims == 3) ? 
1 : cd.strides[ndims - 4]; + jcp.stride_w = cd.strides[ndims - 3]; + /* bias info */ + jcp.with_bias = pick_by_prop_kind(jcp.prop_kind, cd.bias_desc.format_kind, + format_kind::undef, cd.diff_bias_desc.format_kind) + != format_kind::undef; + + /* Spatials */ + jcp.os = jcp.od * jcp.oh * jcp.ow; + jcp.is = jcp.id * jcp.ih * jcp.iw; + + /* Depthwise conv check */ + const auto &post_ops = attr.post_ops_; + const int dw_conv_ind = post_ops.find(primitive_kind::convolution); + jcp.with_dw_conv = dw_conv_ind != -1; + if (jcp.with_dw_conv) { return status::unimplemented; } + + /* Post operation check */ + // Using dw_conv_ind as upper-bound below, as post-ops after it will be + // handled in depthwise convolution. + const int eltwise_ind + = post_ops.find(primitive_kind::eltwise, 0, dw_conv_ind); + jcp.with_eltwise = eltwise_ind != -1; + if (jcp.with_eltwise) { + if (dst_d.data_type() == data_type::s32) { + return status::unimplemented; + } + } + + const int sum_ind = post_ops.find(primitive_kind::sum, 0, dw_conv_ind); + jcp.with_sum = sum_ind != -1; + + const int binary_ind + = post_ops.find(primitive_kind::binary, 0, dw_conv_ind); + jcp.with_binary = binary_ind != -1; + + if (dw_conv_ind >= 0) { + // dw_conv and post_ops after it are handled externally, so skip them + jcp.post_ops.entry_.assign(post_ops.entry_.cbegin(), + post_ops.entry_.cbegin() + dw_conv_ind); + } else { + jcp.post_ops = post_ops; + } + + /* Data format check */ + const auto dat_tag_nxc = pick(ndims - 3, nwc, nhwc, ndhwc); + bool is_data_layout_nxc; + format_tag_t required_dat_tag; + + switch (isa_) { + case sve_512: { + const auto dat_tag_nCx16c + = pick(ndims - 3, nCw16c, nChw16c, nCdhw16c); + jcp.src_tag = src_d.matches_one_of_tag(dat_tag_nxc, dat_tag_nCx16c); + jcp.dst_tag = dst_d.matches_one_of_tag(dat_tag_nxc, dat_tag_nCx16c); + is_data_layout_nxc + = utils::everyone_is(dat_tag_nxc, jcp.src_tag, jcp.dst_tag); + required_dat_tag + = is_data_layout_nxc ? dat_tag_nxc : dat_tag_nCx16c; + break; + } + case sve_256: { + const auto dat_tag_nCx8c = pick(ndims - 3, nCw8c, nChw8c, nCdhw8c); + jcp.src_tag = src_d.matches_one_of_tag(dat_tag_nxc, dat_tag_nCx8c); + jcp.dst_tag = dst_d.matches_one_of_tag(dat_tag_nxc, dat_tag_nCx8c); + is_data_layout_nxc + = utils::everyone_is(dat_tag_nxc, jcp.src_tag, jcp.dst_tag); + required_dat_tag = is_data_layout_nxc ? 
dat_tag_nxc : dat_tag_nCx8c;
+            break;
+        }
+        default: break;
+    }
+    /* Channel padding check */
+    bool ok_to_pad_channels = true && !is_data_layout_nxc && jcp.ngroups == 1
+            && src_d.data_type() == data_type::f32;
+
+    /* Input and output channels must be multiples of simd_w */
+    if (ok_to_pad_channels) {
+        jcp.oc = rnd_up(jcp.oc, simd_w);
+        jcp.ic = rnd_up(jcp.ic, simd_w);
+    }
+
+    using namespace injector;
+
+    static constexpr bool sum_at_pos_0_only = true;
+    static constexpr bool sum_requires_scale_one = true;
+    static constexpr bool sum_requires_zp_zero = true;
+    const bool post_ops_ok_ = post_ops_ok(post_ops_ok_args_t(jcp.isa,
+            {eltwise, binary, sum}, jcp.post_ops, &dst_d, sum_at_pos_0_only,
+            sum_requires_scale_one, sum_requires_zp_zero));
+    if (!post_ops_ok_) { return status::unimplemented; }
+
+    bool args_ok = true && jcp.ngroups == 1 && jcp.src_tag == required_dat_tag
+            && jcp.dst_tag == required_dat_tag
+            && IMPLICATION(!is_data_layout_nxc,
+                    jcp.oc % simd_w == 0 && jcp.ic % simd_w == 0)
+            && jcp.f_pad == 0 && jcp.t_pad == 0 && jcp.l_pad == 0
+            && jcp.stride_w == 1 && jcp.stride_h == 1 && jcp.stride_d == 1
+            && jcp.kd == 1 && jcp.kh == 1 && jcp.kw == 1 && jcp.ow == jcp.iw
+            && jcp.oh == jcp.ih && jcp.od == jcp.id; // enforce rpad=0
+    if (!args_ok) { return status::unimplemented; }
+
+    /* Channel blocking size is simd_w */
+    jcp.ic_block = jcp.oc_block = simd_w;
+
+    switch (isa_) {
+        case sve_512: {
+            jcp.ver = ver_sve_512;
+            break;
+        }
+        case sve_256: {
+            jcp.ver = ver_sve_256;
+            break;
+        }
+        default: break;
+    }
+
+    if (everyone_is(data_type::f32, src_d.data_type(), weights_d.data_type(),
+                dst_d.data_type())) {
+        const int is_bwd_d = jcp.prop_kind == backward_data;
+
+        /* Set weight data layout tag */
+        format_tag_t wei_tag;
+        switch (isa_) {
+            case sve_512: {
+                wei_tag = with_groups
+                        ? pick(2 * ndims - 6 + is_bwd_d, gOIw16i16o, gIOw16o16i,
+                                gOIhw16i16o, gIOhw16o16i, gOIdhw16i16o,
+                                gIOdhw16o16i)
+                        : pick(2 * ndims - 6 + is_bwd_d, OIw16i16o, IOw16o16i,
+                                OIhw16i16o, IOhw16o16i, OIdhw16i16o,
+                                IOdhw16o16i);
+                break;
+            }
+            case sve_256: {
+                wei_tag = with_groups
+                        ? pick(2 * ndims - 6 + is_bwd_d, gOIw8i8o, gIOw8o8i,
+                                gOIhw8i8o, gIOhw8o8i, gOIdhw8i8o, gIOdhw8o8i)
+                        : pick(2 * ndims - 6 + is_bwd_d, OIw8i8o, IOw8o8i,
+                                OIhw8i8o, IOhw8o8i, OIdhw8i8o, IOdhw8o8i);
+                break;
+            }
+            default: break;
+        }
+
+        jcp.wei_tag = weights_d.matches_one_of_tag(wei_tag);
+
+        if (jcp.wei_tag != wei_tag) return status::unimplemented;
+
+        // jcp.fma_step = 1;
+        jcp.typesize_in = sizeof(prec_traits_t<data_type::f32>::type);
+        jcp.typesize_out = sizeof(prec_traits_t<data_type::f32>::type);
+    } else {
+        // TODO: currently only f32 is supported
+        return status::unimplemented;
+    }
+
+    /* Once all the formats are set, check the padding consistency */
+
+    if (!is_data_layout_nxc) {
+        args_ok = true && jcp.ic <= src_d.padded_dims()[1]
+                && jcp.oc <= dst_d.padded_dims()[1]
+                && jcp.ic <= weights_d.padded_dims()[with_groups + 1]
+                && jcp.oc <= weights_d.padded_dims()[with_groups + 0];
+        if (!args_ok) { return status::unimplemented; }
+    }
+
+    // TODO: Optimize the params below
+    const int SMALL_SPATIAL = 10;
+    const int BIG_SPATIAL = 65;
+    const int BIG_REDUCE_DIM = 1024;
+    const int BIG_LOAD_DIM = (jcp.reduce_dim >= 512) ?
256 : 512; + + int load_blocking {0}; + int load_blocking_max {0}; + int bcast_blocking {0}; + int bcast_blocking_max {0}; + int reduce_blocking {0}; + int reduce_blocking_max {0}; + + jcp.load_grp_count = 1; + + // TODO: mov check funcs into platform files + const int L1_capacity + = platform::get_per_core_cache_size(1) / sizeof(float); + const int L2_size = platform::get_per_core_cache_size(2) / sizeof(float); + const int L2_capacity = (L2_size * 3) / 4; + + /* FWD, BWD data */ + + if (one_of(jcp.prop_kind, forward_training, forward_inference, + backward_data)) { + if (one_of(jcp.prop_kind, forward_training, forward_inference)) { + /* Forward */ + if (jcp.with_dw_conv) jcp.ur = nstl::min(jcp.ow, jcp.ur); + jcp.reduce_dim = jcp.ic; // src channel + jcp.reduce_block = jcp.ic_block; // src simd_w + + jcp.load_dim = jcp.oc; // dst channel + jcp.load_block = jcp.oc_block; // dst simd_W + + jcp.bcast_dim = jcp.is; // src H*W + } else { + /* Backward data */ + jcp.reduce_dim = jcp.oc; // src channel + jcp.reduce_block = jcp.oc_block; // src simd_w + + jcp.load_dim = jcp.ic; // dst channel + jcp.load_block = jcp.ic_block; // dst simd_w + + jcp.bcast_dim = jcp.os; // src H*W + } + /* # of consecutive channel elements */ + jcp.reduce_loop_unroll = jcp.reduce_block; + + /* Offset to move to the next 16 input channel elements with the same H*W position */ + jcp.reduce_loop_bcast_step = jcp.reduce_loop_unroll + * (is_data_layout_nxc ? 1 : jcp.bcast_dim) * jcp.typesize_in; + + /* Offset: 16o*16i (filter) */ + jcp.reduce_loop_load_step + = jcp.reduce_loop_unroll * jcp.load_block * jcp.typesize_in; + + /* Offset: I/16 * 16o */ + jcp.load_loop_load_step + = (utils::rnd_up(jcp.reduce_dim, jcp.reduce_block)) + * jcp.load_block * jcp.typesize_in; + + /* adjusting registry blocking */ + int max_regs, min_regs, size_threshold; + + /* spatial : H*D of dst */ + const int spatial + = (one_of(jcp.prop_kind, forward_training, forward_inference)) + ? jcp.od * jcp.oh // forward + : jcp.id * jcp.ih; // backward + + if ((8 * jcp.mb) / jcp.nthr >= 1 + // NHWC perf: RN50 mb=1 + || (is_data_layout_nxc && jcp.mb == 1)) { + max_regs = 9; // max # of ur_w + min_regs = 6; // min # of ur_w + size_threshold = 14; + jcp.expl_bcast = true; + + /* + * H*D of dst > SMALL_SPATIAL + */ + if (jcp.load_dim > 128 && jcp.load_dim < BIG_LOAD_DIM + && spatial > SMALL_SPATIAL && spatial < BIG_SPATIAL + && jcp.reduce_dim < 256) { + max_regs = 6; + min_regs = 5; + } + } else { + max_regs = 30; + min_regs = 9; + size_threshold = 14; + jcp.expl_bcast = false; + jcp.use_vmovntps = true; + } + jcp.ur = 1; + + for (int ur_w = max_regs; ur_w >= min_regs; ur_w--) { + /* + * H*D of dst >= size_threshold, (H*D of dst) % ur_w == 0 + * or + * H*D of dst < size_threshold, (H*W of dst) % ur_w == 0 + */ + if ((spatial >= size_threshold && spatial % ur_w == 0) + || (spatial < size_threshold && jcp.os % ur_w == 0)) { + jcp.ur = ur_w; + break; + } + } + + if (jcp.ur == 1) { + // If ur = 1, then min(max_regs, H*W of dst) + jcp.ur = nstl::min(max_regs, jcp.os); + int os_tail = jcp.os % max_regs; + for (int i = max_regs; i >= min_regs; i--) { + int i_tail = jcp.os % i; + if (i_tail > os_tail || i_tail == 0) { + jcp.ur = i; + os_tail = i_tail; + if (i_tail == 0) break; + } + } + } + jcp.bcast_block = jcp.ur; // block size of bcast (input data) + /* Number of steps for the dst address to output, used in bcast_loop() */ + jcp.bcast_loop_output_step = jcp.ur * jcp.typesize_out + * (is_data_layout_nxc ? 
jcp.load_dim : jcp.load_block);
+            jcp.bcast_loop_output_substep = -1; // unused
+
+            /* Number of steps for the src address to be broadcast in bcast_loop() */
+            jcp.bcast_loop_bcast_step = jcp.ur * jcp.typesize_in
+                    * (is_data_layout_nxc ? jcp.reduce_dim : jcp.reduce_block);
+            jcp.bcast_loop_bcast_substep = -1; // unused
+
+            jcp.load_loop_iter_step = jcp.load_block;
+
+            if (jcp.prop_kind == backward_data)
+                jcp.loop_order = loop_lbr;
+            else
+                jcp.loop_order = reduce_src ? loop_blr : loop_lbr;
+
+            int nb_bcast = div_up(jcp.bcast_dim, jcp.bcast_block);
+            int nb_reduce = div_up(jcp.reduce_dim, jcp.reduce_block);
+            int nb_load = div_up(jcp.load_dim, jcp.load_block);
+            if (is_data_layout_nxc) {
+                reduce_blocking = jcp.reduce_dim;
+            } else if (jcp.expl_bcast) {
+                if (jcp.load_dim <= BIG_LOAD_DIM && spatial > SMALL_SPATIAL
+                        && spatial < BIG_SPATIAL) {
+                    reduce_blocking = nstl::min(jcp.reduce_dim, 80);
+                } else if (spatial > SMALL_SPATIAL)
+                    reduce_blocking = nstl::min(jcp.reduce_dim, 512);
+                else
+                    reduce_blocking = nstl::min(jcp.reduce_dim, 256);
+            } else {
+                reduce_blocking = nb_reduce;
+                if (spatial <= SMALL_SPATIAL && jcp.reduce_dim >= BIG_REDUCE_DIM)
+                    reduce_blocking = 16;
+                else if (spatial > SMALL_SPATIAL
+                        && jcp.reduce_dim >= BIG_REDUCE_DIM)
+                    reduce_blocking = 8;
+                reduce_blocking = best_divider(nb_reduce, 1, reduce_blocking, true);
+                reduce_blocking *= jcp.reduce_block;
+            }
+
+            // Check input data cache aliasing.
+            // The constants below are empirical and may need updating for
+            // other ISAs: way_size models the stride at which accesses land
+            // in the same cache set, and max_hits (7, about half of 16)
+            // leaves roughly half of the set for other data - weights, dst.
+            int way_size = (16 * 1024) / jcp.typesize_in;
+            int max_hits = 7;
+            if (!is_data_layout_nxc
+                    && jcp.bcast_dim * reduce_blocking > way_size * max_hits) {
+                int nrb = reduce_blocking / simd_w;
+                int sp = jcp.bcast_dim;
+                int wl = way_size / simd_w;
+                for (int start_off = 0; start_off < jcp.ur; start_off++) {
+                    for (int off = start_off, hits = 0; off < sp * nrb; off += wl) {
+                        if (off % sp >= jcp.ur || ++hits < max_hits) continue;
+                        int max_r_blocking = simd_w * nstl::max(1, (off + wl) / sp);
+                        reduce_blocking
+                                = nstl::min(reduce_blocking, max_r_blocking);
+                        break;
+                    }
+                }
+            }
+
+            if (reduce_blocking < jcp.reduce_dim) {
+                if (jcp.prop_kind == backward_data)
+                    jcp.loop_order = reduce_src ? loop_lbr : loop_rlb;
+                else
+                    jcp.loop_order = reduce_src ? loop_rbl : loop_rlb;
+            }
+            load_blocking = jcp.load_dim;
+
+            /* Number of weight elements to be loaded */
+            int load_size = jcp.load_dim * jcp.reduce_dim;
+            /* Number of elements to be broadcast from src */
+            auto bcast_size
+                    = (dim_t)jcp.mb * jcp.ngroups * jcp.bcast_dim * jcp.reduce_dim;
+
+            /* 12 cores per CMG */
+            if (jcp.nthr <= 12 && jcp.mb < jcp.nthr
+                    && nb_load * nb_bcast > jcp.nthr) {
+                // Some heuristic here
+                float calc_koef = 0.01, best_cost = FLT_MAX;
+                int n_lgc = jcp.nthr;
+                float ratio = (float)load_size / (float)bcast_size;
+                int best_lgc = ratio > 1 ? n_lgc : 1;
+                auto calc_job_cost = [&](int lb, int tg, float mem_k) {
+                    int bb_size = jcp.mb * div_up(nb_bcast, tg);
+                    float calc_size = (float)(bb_size * jcp.ur)
+                            * (lb * jcp.load_block) * jcp.reduce_dim;
+                    float mem_size = (float)(bb_size * jcp.ur + lb * jcp.load_block)
+                            * jcp.reduce_dim;
+                    return calc_koef * calc_size + mem_k * mem_size;
+                };
+                for (int lgc, ilgc = 0; ilgc < n_lgc; ilgc++) {
+                    lgc = ratio > 1 ?
n_lgc - ilgc : ilgc + 1; + int min_lb = nb_load / lgc; + int max_lb = div_up(nb_load, lgc); + int min_tg = jcp.nthr / lgc; + int max_tg = div_up(jcp.nthr, lgc); + // Some heuristic here + float mem_koef = (max_tg == 1) ? 1.f : 1.3f; + float job_cost = 0.; + if (jcp.nthr % lgc < nb_load % lgc) { + job_cost = calc_job_cost(max_lb, min_tg, mem_koef); + } else { + auto job_cost1 = calc_job_cost(max_lb, max_tg, mem_koef); + auto job_cost2 = calc_job_cost(min_lb, min_tg, mem_koef); + job_cost = nstl::max(job_cost1, job_cost2); + } + + if (job_cost < best_cost) { + best_lgc = lgc; + best_cost = job_cost; + } + } + jcp.load_grp_count = best_lgc; + load_blocking + = div_up(nb_load, jcp.load_grp_count) * jcp.load_block; + } else { + jcp.load_grp_count + = div_up(jcp.nthr, jcp.mb * jcp.ngroups * nb_bcast); + jcp.load_grp_count = best_divider(jcp.nthr, jcp.load_grp_count, + 2 * jcp.load_grp_count, false); + } + if (jcp.expl_bcast && jcp.bcast_dim <= 64 && load_size >= L2_size) { + jcp.load_grp_count = nstl::max(jcp.load_grp_count, 4); + } else if (jcp.bcast_dim <= 49 && jcp.mb <= jcp.nthr + && jcp.load_dim > 512 && jcp.load_dim / jcp.reduce_dim >= 4) { + jcp.load_grp_count = nstl::max(jcp.load_grp_count, 2); + load_blocking = jcp.load_block; + } + + auto get_thr_eff = [=](int load_chunk, int nthr) { + int lgc = div_up(nb_load, load_chunk); + int thr_per_grp = div_up(nthr, lgc); + int bcast_per_thr + = div_up(jcp.mb * nb_bcast, thr_per_grp) * jcp.bcast_block; + int load_per_thr = load_chunk * simd_w; + float data_norm = (bcast_per_thr + load_per_thr) / 2.f; + float data_eff + = (bcast_per_thr * load_per_thr) / (data_norm * data_norm); + float thr_eff_over_grp + = (float)nstl::max(1, nthr / lgc) / div_up(nthr, lgc); + float thr_eff_in_grp = ((float)jcp.mb * nb_bcast) + / rnd_up(jcp.mb * nb_bcast, thr_per_grp); + float thr_eff = thr_eff_over_grp * thr_eff_in_grp; + float load_eff = (float)nb_load / rnd_up(nb_load, lgc); + float overall_eff = data_eff + thr_eff + load_eff; + return overall_eff; + }; + + auto get_load_chunk = [=](int nthr) { + float best_eff = -1.0f; + int best_lgc = 1; + float eff; + + for (int load_chunk = 1; load_chunk <= nb_load; load_chunk++) { + int lgc = div_up(nb_load, load_chunk); + if (lgc > nthr) continue; + eff = get_thr_eff(load_chunk, nthr); + if (eff > best_eff) { + best_eff = eff; + best_lgc = lgc; + } + } + return best_lgc; + }; + + /* adjust the thread decomposition + * to improve the thr_eff for small problem size + * the threshold 8192 is empirical + * TODO: Threshold can be increase for init stride > 1*/ + if (sizeof(float) * bcast_size < 8192 && jcp.mb < jcp.nthr + && nb_load * nb_bcast < jcp.nthr) { + float best_thr_eff = -1.0f; + float thr_eff = -1.0f; + int overall_lgc = jcp.load_grp_count; + int lgc = 1; + int best_nthr = jcp.nthr; + int end_nthr = with_groups ? 
jcp.ngroups : 1; + for (int nthr = jcp.nthr / 2; nthr >= end_nthr; nthr--) { + lgc = get_load_chunk(nthr); + thr_eff = get_thr_eff(lgc, nthr); + if (best_thr_eff < thr_eff) { + best_thr_eff = thr_eff; + overall_lgc = lgc; + best_nthr = nthr; + } + } + jcp.nthr = best_nthr; + jcp.load_grp_count = overall_lgc; + load_blocking + = div_up(nb_load, jcp.load_grp_count) * jcp.load_block; + } + + bcast_blocking = div_up(jcp.mb * jcp.ngroups * nb_bcast, + div_up(jcp.nthr, jcp.load_grp_count)) + * jcp.bcast_block; + bcast_blocking = nstl::min(jcp.bcast_dim, bcast_blocking); + bcast_blocking = rnd_up(bcast_blocking, jcp.bcast_block); + + int space_for_bcast = (L2_capacity - /* kernel_size - */ + 2 * jcp.load_block * reduce_blocking - jcp.ur * reduce_blocking + - 3 * 1024); + if (jcp.reduce_dim * jcp.bcast_dim > L2_capacity) space_for_bcast /= 2; + + int bcast_in_cache + = nstl::max(jcp.bcast_block, space_for_bcast / reduce_blocking); + bcast_blocking = nstl::min( + bcast_blocking, rnd_dn(bcast_in_cache, jcp.bcast_block)); + // NHWC perf + if (is_data_layout_nxc) bcast_blocking = jcp.bcast_block; + + load_blocking_max = load_blocking; + bcast_blocking_max = bcast_blocking * 3 / 2; + reduce_blocking_max = reduce_blocking; + + jcp.ur_tail = (jcp.with_dw_conv ? jcp.ow : jcp.bcast_dim) % jcp.ur; + + } else if (jcp.prop_kind == backward_weights) { /* BWD weight */ + + jcp.reduce_dim = jcp.is; + + jcp.reduce_block = best_divider(jcp.reduce_dim, 7, 16, true); + if (jcp.reduce_dim % jcp.reduce_block != 0) + jcp.reduce_block = best_divider(jcp.iw, 4, jcp.iw, false); + if (jcp.reduce_block > 256) { jcp.reduce_block = 1; } + + jcp.load_dim = jcp.oc; + jcp.load_block = jcp.oc_block; + + jcp.bcast_dim = jcp.ic; + jcp.bcast_block = jcp.ic_block; + + if (jcp.reduce_block <= 19 && + // maskrcnn optimization for nxc; don't reduce ur when ocb<=1 + !(is_data_layout_nxc && jcp.load_dim <= jcp.load_block)) { + // if reduce_block is big then generated JIT code may be big + // for small values of ur because reduce_loop_unroll = reduce_block + jcp.ur = jcp.bcast_block / 2; + jcp.expl_bcast = true; + } else { + jcp.ur = jcp.bcast_block; + jcp.expl_bcast = false; + } + + jcp.ur_tail = jcp.bcast_dim % jcp.bcast_block; + jcp.reduce_loop_unroll = jcp.reduce_block; + jcp.reduce_loop_bcast_step = jcp.typesize_in * jcp.reduce_loop_unroll + * (is_data_layout_nxc ? jcp.ic : jcp.ic_block); + jcp.reduce_loop_load_step = jcp.typesize_in * jcp.reduce_loop_unroll + * (is_data_layout_nxc ? jcp.oc : jcp.oc_block); + + jcp.bcast_loop_output_step + = jcp.oc_block * jcp.ic_block * jcp.typesize_out; + jcp.bcast_loop_output_substep + = jcp.oc_block * jcp.ur * jcp.typesize_out; + jcp.bcast_loop_bcast_step = jcp.ic_block + * (is_data_layout_nxc ? 1 + : utils::rnd_up( + jcp.reduce_dim, jcp.reduce_block)) + * jcp.typesize_in; + jcp.bcast_loop_bcast_substep = jcp.ur * jcp.typesize_in; + + jcp.load_loop_load_step = jcp.typesize_in * jcp.oc_block + * (is_data_layout_nxc ? 
1 : jcp.os);
+        jcp.load_loop_iter_step = jcp.oc_block;
+
+        /* --- */
+        balance(jcp);
+
+        load_blocking = div_up(jcp.load_dim, jcp.load_block);
+        load_blocking = best_divider(load_blocking, 16, load_blocking, false);
+        load_blocking *= jcp.load_block;
+
+        load_blocking_max = load_blocking;
+        assert(IMPLICATION(
+                !is_data_layout_nxc, jcp.load_dim % load_blocking == 0));
+
+        int max_bcast_blocking = div_up(jcp.bcast_dim, jcp.bcast_block);
+        int min_bcast_blocking = 5;
+
+        bcast_blocking = div_up(jcp.bcast_dim, jcp.bcast_block);
+        bcast_blocking = best_divider(
+                bcast_blocking, min_bcast_blocking, max_bcast_blocking, false);
+        bcast_blocking *= jcp.bcast_block;
+        bcast_blocking_max = bcast_blocking;
+        assert(IMPLICATION(
+                !is_data_layout_nxc, jcp.bcast_dim % bcast_blocking == 0));
+
+        // for reduction balance
+        if (is_data_layout_nxc && jcp.reduce_dim >= BIG_SPATIAL * BIG_SPATIAL
+                && jcp.load_dim >= BIG_LOAD_DIM / 2) {
+            reduce_blocking = rnd_up(nstl::min(jcp.ow, 256), jcp.reduce_block);
+        } else {
+            int max_reduce_blocking
+                    = nstl::min(L1_capacity / jcp.ur, jcp.reduce_dim);
+            int min_reduce_blocking = nstl::min(
+                    L1_capacity / jcp.ur, nstl::max(jcp.iw, jcp.ih));
+            reduce_blocking = best_divider(jcp.reduce_dim, min_reduce_blocking,
+                    max_reduce_blocking, true);
+            reduce_blocking
+                    = nstl::max(rnd_dn(reduce_blocking, jcp.reduce_block),
+                            jcp.reduce_block);
+        }
+
+        reduce_blocking_max = rnd_dn(reduce_blocking * 3 / 2, jcp.reduce_block);
+    } else {
+        return status::unimplemented;
+    }
+
+    assert(load_blocking);
+    assert(load_blocking_max);
+    assert(bcast_blocking);
+    assert(bcast_blocking_max);
+    assert(reduce_blocking);
+    assert(reduce_blocking_max);
+
+    if (!is_data_layout_nxc) {
+        assert(load_blocking % jcp.load_block == 0);
+        assert(reduce_blocking % jcp.reduce_block == 0);
+        assert(load_blocking_max % jcp.load_block == 0);
+        assert(reduce_blocking_max % jcp.reduce_block == 0);
+        assert(jcp.reduce_dim % jcp.reduce_block == 0);
+    }
+
+    assert(jcp.bcast_block % jcp.ur == 0);
+
+    jcp.nb_bcast_blocking = bcast_blocking / jcp.bcast_block;
+    jcp.nb_bcast_blocking_max = bcast_blocking_max / jcp.bcast_block;
+    jcp.nb_load_blocking = utils::div_up(load_blocking, jcp.load_block);
+    jcp.nb_load_blocking_max = utils::div_up(load_blocking_max, jcp.load_block);
+    jcp.nb_reduce_blocking = utils::div_up(reduce_blocking, jcp.reduce_block);
+    jcp.nb_reduce_blocking_max
+            = utils::div_up(reduce_blocking_max, jcp.reduce_block);
+
+    jcp.nb_bcast = div_up(jcp.bcast_dim, jcp.bcast_block);
+    jcp.nb_load = div_up(jcp.load_dim, jcp.load_block);
+    jcp.nb_reduce = div_up(jcp.reduce_dim, jcp.reduce_block);
+    return status::success;
+}
+
+template <cpu_isa_t isa_>
+void jit_sve_1x1_conv_kernel<isa_>::init_scratchpad(
+        memory_tracking::registrar_t &scratchpad,
+        const jit_1x1_conv_conf_t &jcp) {
+
+    using namespace dnnl::impl::memory_tracking::names;
+
+    // For nxc layout bias is padded only for the bwd_w direction, as bias
+    // reduction kernels can't handle tails yet.
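+    // Sizing example (illustrative values, not taken from this code): with
+    // ngroups = 1, oc = 20 and oc_block = 16, rnd_up(oc, oc_block) = 32,
+    // so 32 bias elements are booked and the 12-element tail can be
+    // zero-filled before the reduction writes whole blocks.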
+    if (jcp.with_bias && jcp.prop_kind != backward_data
+            && (jcp.oc != jcp.oc_without_padding // blocked layout
+                    || (jcp.prop_kind == backward_weights // nxc layout
+                            && jcp.oc % jcp.oc_block != 0))) {
+
+        const size_t nelems_padded_bias
+                = jcp.ngroups * utils::rnd_up(jcp.oc, jcp.oc_block);
+        scratchpad.book(
+                key_conv_padded_bias, nelems_padded_bias, jcp.typesize_out);
+    }
+
+    if (jcp.prop_kind == backward_weights) {
+        const size_t wei_size = (size_t)jcp.ngroups
+                * rnd_up(jcp.oc, jcp.oc_block) * rnd_up(jcp.ic, jcp.ic_block);
+        scratchpad.book(key_conv_wei_reduction, wei_size * (jcp.nthr_mb - 1),
+                jcp.typesize_out);
+    }
+}
+
+/* BWD W */
+template <cpu_isa_t isa_>
+void jit_sve_1x1_conv_kernel<isa_>::balance(jit_1x1_conv_conf_t &jcp) {
+    int nthreads = jcp.nthr;
+    // initialize jcp reduction threading properties
+    jcp.nthr = jcp.nthr_mb = jcp.nthr_g = jcp.nthr_oc_b = jcp.nthr_ic_b = 1;
+    if (nthreads < jcp.ngroups) {
+        /* simplification... fortunately it doesn't hurt much */
+        return;
+    }
+    const int nb_bcast = div_up(jcp.bcast_dim, jcp.bcast_block);
+    const int nb_load = div_up(jcp.load_dim, jcp.load_block);
+    const int nb_reduce = div_up(jcp.reduce_dim, jcp.reduce_block);
+
+    jcp.nthr_g = jcp.ngroups;
+    const int nthr = nthreads / jcp.nthr_g;
+
+    auto calc_mem_cost = [=](int nthr_mb, int nthr_oc_b, int nthr_ic_b) {
+        /* calculate per thread memory cost (read/write). high level
+         * optimizer tries to minimize memory consumption. few notes: (n1)
+         * unclear why, but that essentially helps first convolution...
+         * (n2) assuming the reduction over minibatch is always there:
+         * - instead of 8 it should be 5 here (write ~= 2 read):
+         *   kernel: temporal workspace 1 write
+         *   reduction: 1 read from workspace and 1 write to the diff_wei
+         * - but experiments showed 8 works better than 5 or 6...
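+         * Reading of the terms below (an interpretation added here, not
+         * part of the original notes): bcast_koeff scales the per-thread
+         * src reads, load_koeff the diff_dst reads, and output_koeff the
+         * diff_weights traffic; output_koeff = 12 makes the write term
+         * dominate, so the search prefers splits that keep each thread's
+         * weight tile small.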
+         */
+        int bcast_koeff = 1;
+        int load_koeff = 1;
+        int output_koeff = 12;
+        return 0
+                + (size_t)bcast_koeff * div_up(jcp.mb * nb_reduce, nthr_mb)
+                * div_up(jcp.ngroups, jcp.nthr_g) * div_up(nb_bcast, nthr_ic_b)
+                * jcp.ic_block * jcp.reduce_block / jcp.stride_h
+                / jcp.stride_w /* (n1) */
+                + (size_t)load_koeff * div_up(jcp.mb * nb_reduce, nthr_mb)
+                * div_up(jcp.ngroups, jcp.nthr_g) * div_up(nb_load, nthr_oc_b)
+                * jcp.oc_block * jcp.reduce_block
+                + (size_t)output_koeff /* (n2) */
+                * div_up(jcp.ngroups, jcp.nthr_g) * div_up(nb_load, nthr_oc_b)
+                * div_up(nb_bcast, nthr_ic_b) * jcp.ic_block * jcp.oc_block;
+    };
+
+    int nthr_mb = 1, nthr_oc_b = 1, nthr_ic_b = 1;
+    auto best_mem_cost = calc_mem_cost(nthr_mb, nthr_oc_b, nthr_ic_b);
+
+    /* step 1: find the best thread distribution with lowest memory cost */
+    const int nthr_mb_max = nstl::min(nthr, jcp.mb * nb_reduce);
+    for (nthr_mb = 1; nthr_mb <= nthr_mb_max; ++nthr_mb) {
+        const int nthr_par = nthr / nthr_mb;
+        const int nthr_oc_b_max = nstl::min(nthr_par, nb_load);
+        for (nthr_oc_b = 1; nthr_oc_b <= nthr_oc_b_max; ++nthr_oc_b) {
+            nthr_ic_b = nstl::min(nthr_par / nthr_oc_b, nb_bcast);
+            auto mem_cost = calc_mem_cost(nthr_mb, nthr_oc_b, nthr_ic_b);
+            if (mem_cost <= best_mem_cost) {
+                best_mem_cost = mem_cost;
+                jcp.nthr_mb = nthr_mb;
+                jcp.nthr_oc_b = nthr_oc_b;
+                jcp.nthr_ic_b = nthr_ic_b;
+            }
+        }
+    }
+    if (jcp.nthr_mb > nthreads / 2 && jcp.nthr_mb < nthreads)
+        jcp.nthr_mb = nstl::min(jcp.mb, nthreads);
+
+    jcp.nthr = jcp.nthr_mb * jcp.nthr_g * jcp.nthr_oc_b * jcp.nthr_ic_b;
+    assert(jcp.nthr <= nthreads);
+}
+
+template struct jit_sve_1x1_conv_kernel<sve_512>;
+template struct jit_sve_1x1_conv_kernel<sve_256>;
+} // namespace aarch64
+} // namespace cpu
+} // namespace impl
+} // namespace dnnl
diff --git a/src/cpu/aarch64/jit_sve_1x1_conv_kernel.hpp b/src/cpu/aarch64/jit_sve_1x1_conv_kernel.hpp
new file mode 100644
index 00000000000..5bfd5db50de
--- /dev/null
+++ b/src/cpu/aarch64/jit_sve_1x1_conv_kernel.hpp
@@ -0,0 +1,205 @@
+/*******************************************************************************
+* Copyright 2021-2023 Intel Corporation
+* Copyright 2021-2024 FUJITSU LIMITED
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#ifndef CPU_AARCH64_JIT_SVE_1x1_CONV_KERNEL_HPP
+#define CPU_AARCH64_JIT_SVE_1x1_CONV_KERNEL_HPP
+
+#include "common/c_types_map.hpp"
+#include "common/memory_tracking.hpp"
+
+#include "cpu/aarch64/injectors/jit_uni_postops_injector.hpp"
+#include "cpu/aarch64/jit_generator.hpp"
+#include "cpu/aarch64/jit_op_imm_check.hpp"
+#include "cpu/aarch64/jit_primitive_conf.hpp"
+
+using namespace Xbyak_aarch64;
+
+namespace dnnl {
+namespace impl {
+namespace cpu {
+namespace aarch64 {
+
+/* Get vector offsets: ofs / VL (e.g. VL: 512 bits = 64 bytes) */
+#define VL64_OFS(ofs) (ofs >> cpu_isa_traits<isa_>::vlen_shift)
+
+template <cpu_isa_t isa_>
+struct jit_sve_1x1_conv_kernel : public jit_generator {
+    jit_sve_1x1_conv_kernel(const jit_1x1_conv_conf_t &ajcp,
+            const primitive_attr_t &attr, const memory_desc_t &dst_md);
+
+    DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_sve_1x1_conv_kernel)
+
+    static status_t init_conf(jit_1x1_conv_conf_t &jcp,
+            const convolution_desc_t &cd, const memory_desc_wrapper &src_d,
+            const memory_desc_wrapper &weights_d,
+            const memory_desc_wrapper &dst_d, const primitive_attr_t &attr,
+            int nthreads, bool reduce_src);
+
+    static void init_scratchpad(memory_tracking::registrar_t &scratchpad,
+            const jit_1x1_conv_conf_t &jcp);
+
+    jit_1x1_conv_conf_t jcp;
+    const primitive_attr_t &attr_;
+
+private:
+    using reg64_t = const XReg;
+
+    /* Flags and loop variables */
+    reg64_t reg_reduce_pos_flag = x1;
+    reg64_t reduce_loop_iter = x2;
+    reg64_t reg_bcast_loop_iter = x3;
+    reg64_t reg_relu_ns = x20; // For forward
+    reg64_t reg_output_stride = x20; // For backward
+
+    /* Pointers */
+    reg64_t reg_bcast_data = x5; // Input
+    reg64_t reg_load_data = x6; // Weight
+    reg64_t reg_output_data = x7; // Output
+    reg64_t reg_bias_data = x8; // bias
+    reg64_t aux1_reg_bcast_data = x9;
+    reg64_t aux_reg_output_data = x10;
+    reg64_t aux_reg_bcast_data = x11;
+    reg64_t aux_reg_load_data = x12;
+    reg64_t reg_prev_bcast_addr
+            = x13; // Input: keeps the address accessed by the previous ldr inst
+    reg64_t reg_prev_out_addr
+            = x14; // Output: keeps the address accessed by the previous ldr or str inst
+
+    /* Workload */
+    reg64_t reg_load_loop_work = x15;
+    reg64_t reg_reduce_loop_work = x16;
+    reg64_t reg_bcast_loop_work = x17;
+
+    /* Temporary registers */
+    reg64_t reg_tmp_imm = x27; // tmp for add_imm
+    reg64_t reg_tmp_ofs = x19; // tmp reg to calc bwd wei offset in out_load
+
+    reg64_t reg_load_dim_tail_mask = aux_reg_load_data;
+
+    std::unique_ptr<injector::jit_uni_postops_injector_t<isa_>>
+            postops_injector_;
+
+    constexpr static int isa_simd_width_
+            = cpu_isa_traits<isa_>::vlen / sizeof(float);
+
+    ZReg vreg_bcast = ZReg(31);
+    PReg k_load_dim_mask = p2;
+    PReg k_load_dim_tail_mask = p3;
+    ZReg zreg_tmp = ZReg(31);
+    ZReg zreg_tmp1 = ZReg(30);
+
+    constexpr static int reg64_size_ = sizeof(int64_t);
+    constexpr static int reg_bcast_loop_work_offt = 0;
+    constexpr static int reg_binary_post_op_acc_off = 1 * reg64_size_;
+    constexpr static int reg_abi_param1_backup = 2 * reg64_size_;
+    constexpr static int stack_space_needed = 3 * reg64_size_;
+
+    template <typename T>
+    Xbyak_aarch64::XReg EVEX_compress_addr(const Xbyak_aarch64::XReg &addr,
+            const Xbyak_aarch64::XReg &x_tmp, Xbyak_aarch64::XReg base,
+            T raw_offt, bool bcast = false) {
+
+        assert(raw_offt <= INT_MAX);
+        auto offt = static_cast<int>(raw_offt);
+
+        add_imm(addr, base, offt, x_tmp);
+        if (bcast) {
+            // The returned address is the same whether or not bcast is set.
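+            // Note (assumption, added for clarity): unlike the x64 EVEX
+            // encoding, SVE has no embedded-broadcast addressing mode;
+            // broadcast loads are instead issued at the use site with
+            // ld1rw, so no address adjustment is needed here.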
+ } + return addr; + } + + void prefetch( + const std::string prfop, int level, reg64_t in, long long int ofs) { + bool for_load = false; + if (prfop == "LD") { + for_load = true; + } else if (prfop == "ST") { + for_load = false; + } else { + assert(!"invalid prfop"); + } + + bool cacheline_aligned = ((ofs & 0xFF) == 0) ? true : false; + if (cacheline_aligned == true) { + Prfop op; + switch (level) { + case 1: op = (for_load == true) ? PLDL1KEEP : PSTL1KEEP; break; + case 2: op = (for_load == true) ? PLDL2KEEP : PSTL2KEEP; break; + case 3: op = (for_load == true) ? PLDL3KEEP : PSTL3KEEP; break; + default: assert(!"invalid prfop"); break; + } + + if (prfm_imm_check(ofs)) { + prfm(op, ptr(in, static_cast(ofs))); + } else { + add_imm(reg_tmp_ofs, in, ofs, reg_tmp_imm); + prfm(op, ptr(reg_tmp_ofs)); + } + } else { + PrfopSve op_sve; + switch (level) { + case 1: + op_sve = (for_load == true) ? PLDL1KEEP_SVE : PSTL1KEEP_SVE; + break; + case 2: + op_sve = (for_load == true) ? PLDL2KEEP_SVE : PSTL2KEEP_SVE; + break; + case 3: + op_sve = (for_load == true) ? PLDL3KEEP_SVE : PSTL3KEEP_SVE; + break; + default: assert(!"invalid prfop"); break; + } + + if (prfw_imm_check(ofs)) { + prfw(op_sve, P_ALL_ONE, + ptr(in, static_cast(VL64_OFS(ofs)))); + } else { + add_imm(reg_tmp_ofs, in, ofs, reg_tmp_imm); + prfw(op_sve, P_ALL_ONE, ptr(reg_tmp_ofs)); + } + } + } + + void bcast_loop(int load_loop_blk); + void reduce_loop(int load_loop_blk, int ur, int substep, bool wraparound); + + void generate() override; + static void balance(jit_1x1_conv_conf_t &jcp); + + inline size_t get_output_offset( + const bool is_out_layout_nxc, const int i_load, const int i_ur) { + const size_t i_load_shift = is_out_layout_nxc + ? jcp.load_block + : (jcp.with_dw_conv ? jcp.ow : jcp.bcast_dim) * jcp.load_block; + const size_t i_ur_shift + = is_out_layout_nxc ? jcp.load_dim : jcp.load_block; + return jcp.typesize_out * (i_load * i_load_shift + i_ur * i_ur_shift); + } + + Xbyak_aarch64::XReg output_ptr(const bool out_layout_nxc, const int i_load, + const int i_ur, Xbyak_aarch64::XReg addr); + void apply_postops(const bool is_out_layout_nxc, const int load_loop_blk, + const int ur); +}; + +} // namespace aarch64 +} // namespace cpu +} // namespace impl +} // namespace dnnl + +#endif diff --git a/src/cpu/aarch64/jit_sve_1x1_convolution.cpp b/src/cpu/aarch64/jit_sve_1x1_convolution.cpp new file mode 100644 index 00000000000..065863d93c8 --- /dev/null +++ b/src/cpu/aarch64/jit_sve_1x1_convolution.cpp @@ -0,0 +1,1057 @@ +/******************************************************************************* +* Copyright 2021-2023 Intel Corporation +* Copyright 2021-2024 FUJITSU LIMITED +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +#include "common/c_types_map.hpp" +#include "common/dnnl_thread.hpp" +#include "common/type_helpers.hpp" +#include "common/utils.hpp" + +#include "cpu/aarch64/jit_generator.hpp" + +#include "cpu/aarch64/jit_sve_1x1_convolution.hpp" + +namespace dnnl { +namespace impl { +namespace cpu { +namespace aarch64 { + +using namespace dnnl::impl::status; +using namespace dnnl::impl::memory_tracking::names; +using namespace dnnl::impl::utils; + +#define data_blk_off(f, n, c, d, h, w) \ + ((ndims == 3) ? (f).blk_off(n, c, w) \ + : ((ndims == 4) ? (f).blk_off(n, c, h, w) \ + : (f).blk_off(n, c, d, h, w))) +/* convolution forward */ + +template +void jit_sve_1x1_convolution_fwd_t::execute_forward(const exec_ctx_t &ctx) const { + const auto &jcp = kernel_->jcp; + auto src = CTX_IN_MEM(const src_data_t *, DNNL_ARG_SRC); + auto weights = CTX_IN_MEM(const wei_data_t *, DNNL_ARG_WEIGHTS); + auto bias = CTX_IN_MEM(const dst_data_t *, DNNL_ARG_BIAS); + auto dst = CTX_OUT_MEM(dst_data_t *, DNNL_ARG_DST); + auto weights_dw = CTX_IN_MEM( + const wei_data_t *, DNNL_ARG_ATTR_POST_OP_DW | DNNL_ARG_WEIGHTS); + auto bias_dw = CTX_IN_MEM( + const dst_data_t *, DNNL_ARG_ATTR_POST_OP_DW | DNNL_ARG_BIAS); + const auto post_ops_binary_rhs_arg_vec + = binary_injector::prepare_binary_args(pd()->jcp_.post_ops, ctx); + const auto post_ops_binary_rhs_arg_vec_dw = pd()->dw_conv_pd_ + ? binary_injector::prepare_binary_args( + pd()->dw_conv_pd_->jcp_.post_ops, ctx, + pd()->jcp_.post_ops.entry_.size() + 1) + : std::vector {}; + + auto scratchpad = ctx.get_scratchpad_grantor(); + + if (pd()->wants_padded_bias()) { + auto padded_bias + = scratchpad.template get(key_conv_padded_bias); + utils::array_copy(padded_bias, bias, jcp.oc_without_padding); + utils::array_set(padded_bias + jcp.oc_without_padding, 0.f, + jcp.oc - jcp.oc_without_padding); + bias = padded_bias; + } + + parallel(jcp.nthr, [&](const int ithr, const int nthr) { + execute_forward_thr(ithr, nthr, src, weights, bias, weights_dw, bias_dw, + dst, scratchpad, post_ops_binary_rhs_arg_vec.data(), + post_ops_binary_rhs_arg_vec_dw.data()); + }); + + if (pd()->wants_zero_pad_dst()) ctx.zero_pad_output(DNNL_ARG_DST); +} + +template +void jit_sve_1x1_convolution_fwd_t::execute_forward_thr(const int ithr, const int nthr, + const src_data_t *src, const wei_data_t *weights, + const dst_data_t *bias, const wei_data_t *weights_dw, + const dst_data_t *bias_dw, dst_data_t *dst, + const memory_tracking::grantor_t &scratchpad, + const void *post_ops_binary_rhs_arg_vec, + const void *post_ops_binary_rhs_arg_vec_dw) const { + const memory_desc_wrapper src_d(pd()->src_md()); + const memory_desc_wrapper dst_d(pd()->dst_md()); + const memory_desc_wrapper weights_d(pd()->weights_md(0)); + const memory_desc_wrapper dw_weights_d( + pd()->arg_md(DNNL_ARG_ATTR_POST_OP_DW | DNNL_ARG_WEIGHTS)); + const memory_desc_wrapper dw_bias_d( + pd()->arg_md(DNNL_ARG_ATTR_POST_OP_DW | DNNL_ARG_BIAS)); + + const auto &jcp = kernel_->jcp; + auto rtus_space = pd()->rtus_.reduce_src_ + ? scratchpad.get(key_conv_rtus_space) + : nullptr; + + const int ndims = src_d.ndims(); + const int stride_d = (ndims == 5) ? pd()->desc()->strides[0] : 1; + const int stride_h = (ndims == 3) ? 1 : pd()->desc()->strides[ndims - 4]; + const int stride_w = pd()->desc()->strides[ndims - 3]; + + auto step = [](int default_step, int remaining, int tail_step) { + assert(default_step <= tail_step); + return remaining < tail_step ? 
remaining : default_step; + }; + + auto p = jit_1x1_conv_call_s(); + auto rp = typename rtus_driver_t::call_params_t(); + const int nb_oc = jcp.nb_load; + const int nb_ic = jcp.nb_reduce; + const int nb_ic_blocking = jcp.nb_reduce_blocking; + + // override some constants for fused dw_conv + const int os_block = jcp.with_dw_conv ? jcp.ow : jcp.bcast_block; + const int nb_bcast = jcp.with_dw_conv ? jcp.oh : jcp.nb_bcast; + const int nb_bcast_blocking = jcp.with_dw_conv ? 1 : jcp.nb_bcast_blocking; + const int nb_bcast_blocking_max + = jcp.with_dw_conv ? 1 : jcp.nb_bcast_blocking_max; + const int nb_load_blocking = jcp.nb_load_blocking; + const int nb_load_blocking_max = jcp.with_dw_conv + ? jcp.nb_load_blocking + : jcp.nb_load_blocking_max; + const bool is_dst_layout_nxc = utils::one_of( + jcp.dst_tag, format_tag::nwc, format_tag::nhwc, format_tag::ndhwc); + const bool is_src_layout_nxc = utils::one_of( + jcp.src_tag, format_tag::nwc, format_tag::nhwc, format_tag::ndhwc); + + // Begin: declare Variables needed for dw conv. + memory_tracking::grantor_t dw_scratchpad( + scratchpad, memory_tracking::names::prefix_fusion); + dst_data_t *pbuf; + size_t row_offset; + const int nb_buffer = jcp.nb_load_blocking; + std::vector addrs; + // End + + auto init_bcast = [&](int iwork, int bcast_end, int &n, int &g, + int &bcast_step, int &od, int &oh, int &ow, + int &id, int &ih, int &iw) { + int osb {0}; + nd_iterator_init(iwork, n, jcp.mb, g, jcp.ngroups, osb, nb_bcast); + bcast_step = step( + nb_bcast_blocking, nb_bcast - osb, nb_bcast_blocking_max); + bcast_step = nstl::min(bcast_step, bcast_end - iwork); + + const int os = osb * os_block; + od = os / (jcp.oh * jcp.ow); + int os_2d = os % (jcp.oh * jcp.ow); + oh = os_2d / jcp.ow; + ow = os_2d % jcp.ow; + + id = od * stride_d; + ih = oh * stride_h; + iw = ow * stride_w; + rp.iw_start = iw; + + p.bcast_dim = this_block_size(os, jcp.os, bcast_step * os_block); + rp.os = p.bcast_dim; + }; + + auto init_load = [&](int ocb, int ocb_end, int &load_step) { + load_step = step(nb_load_blocking, ocb_end - ocb, nb_load_blocking_max); + const auto max_oc + = nstl::min(ocb_end * jcp.oc_block, jcp.oc_without_padding); + p.load_dim = this_block_size( + ocb * jcp.oc_block, max_oc, load_step * jcp.oc_block); + }; + + auto init_reduce = [&](int icb) { + const int nb_ic_blocking_step + = nstl::min(icb + nb_ic_blocking, nb_ic) - icb; + p.first_last_flag = 0 | (icb == 0 ? FLAG_REDUCE_FIRST : 0) + | (icb + nb_ic_blocking_step >= nb_ic ? FLAG_REDUCE_LAST : 0); + + p.reduce_dim = this_block_size( + icb * jcp.ic_block, jcp.ic, nb_ic_blocking_step * jcp.ic_block); + rp.icb = p.reduce_dim; + }; + + auto ker_1x1 = [&](int ocb, int ocb_start, int icb, int n, int g, int od, + int oh, int ow, int id, int ih, int iw) { + const int oc_off_idx = is_dst_layout_nxc + ? g * jcp.oc + ocb * jcp.oc_block + : g * nb_oc + ocb; + const size_t dst_off = data_blk_off(dst_d, n, oc_off_idx, od, oh, ow); + + p.output_data = jcp.with_dw_conv + ? pbuf + (oh % pd()->dw_conv_pd_->jcp_.kh) * row_offset + : &dst[dst_off]; + p.bias_data = bias + ? &bias[oc_off_idx * (is_dst_layout_nxc ? 1 : jcp.oc_block)] + : nullptr; + + p.load_data + = &weights[pd()->with_groups() ? weights_d.blk_off(g, ocb, icb) + : weights_d.blk_off(ocb, icb)]; + const int ic_off_idx = is_src_layout_nxc + ? g * jcp.ic + icb * jcp.ic_block + : g * nb_ic + icb; + if (pd()->rtus_.reduce_src_) { + rp.ws = rtus_space + ithr * pd()->rtus_.space_per_thread_ + + (is_src_layout_nxc ? 
ic_off_idx + : jcp.is * ic_off_idx * jcp.ic_block); + if (ocb == ocb_start) { + rp.src = src + data_blk_off(src_d, n, ic_off_idx, id, ih, iw); + (*rtus_driver_)(&rp); + } + p.bcast_data = rp.ws; + } else + p.bcast_data = src + data_blk_off(src_d, n, ic_off_idx, id, ih, iw); + + p.oc_l_off = oc_off_idx * (is_dst_layout_nxc ? 1 : jcp.oc_block); + p.post_ops_binary_rhs_arg_vec = post_ops_binary_rhs_arg_vec; + p.dst_orig = dst; + + (*kernel_)(&p); + }; + auto conv_1x1 = [&](int bcast_start, int bcast_end, int ocb_start, + int ocb_end) { + if (bcast_start >= bcast_end || ocb_start >= ocb_end) return; + + if (jcp.loop_order == loop_rlb) { + for (int icb = 0; icb < nb_ic; icb += nb_ic_blocking) { + init_reduce(icb); + int ocb = ocb_start; + while (ocb < ocb_end) { + int load_step; + init_load(ocb, ocb_end, load_step); + int iwork = bcast_start; + while (iwork < bcast_end) { + int n {0}, g {0}, bcast_step {0}, od {0}, oh {0}, + ow {0}, id {0}, ih {0}, iw {0}; + init_bcast(iwork, bcast_end, n, g, bcast_step, od, oh, + ow, id, ih, iw); + ker_1x1(ocb, ocb_start, icb, n, g, od, oh, ow, id, ih, + iw); + iwork += bcast_step; + } + ocb += load_step; + } + } + } else if (jcp.loop_order == loop_lbr) { + int ocb = ocb_start; + while (ocb < ocb_end) { + int load_step; + init_load(ocb, ocb_end, load_step); + int iwork = bcast_start; + while (iwork < bcast_end) { + int n {0}, g {0}, bcast_step {0}, od {0}, oh {0}, ow {0}, + id {0}, ih {0}, iw {0}; + init_bcast(iwork, bcast_end, n, g, bcast_step, od, oh, ow, + id, ih, iw); + for (int icb = 0; icb < nb_ic; icb += nb_ic_blocking) { + init_reduce(icb); + ker_1x1(ocb, ocb_start, icb, n, g, od, oh, ow, id, ih, + iw); + } + iwork += bcast_step; + } + ocb += load_step; + } + } else if (jcp.loop_order == loop_rbl) { + for (int icb = 0; icb < nb_ic; icb += nb_ic_blocking) { + init_reduce(icb); + int iwork = bcast_start; + while (iwork < bcast_end) { + int n {0}, g {0}, bcast_step {0}, od {0}, oh {0}, ow {0}, + id {0}, ih {0}, iw {0}; + init_bcast(iwork, bcast_end, n, g, bcast_step, od, oh, ow, + id, ih, iw); + int ocb = ocb_start; + while (ocb < ocb_end) { + int load_step; + init_load(ocb, ocb_end, load_step); + ker_1x1(ocb, ocb_start, icb, n, g, od, oh, ow, id, ih, + iw); + ocb += load_step; + } + iwork += bcast_step; + } + } + } else if (jcp.loop_order == loop_blr) { + int iwork = bcast_start; + while (iwork < bcast_end) { + int n {0}, g {0}, bcast_step {0}, od {0}, oh {0}, ow {0}, + id {0}, ih {0}, iw {0}; + init_bcast(iwork, bcast_end, n, g, bcast_step, od, oh, ow, id, + ih, iw); + int ocb = ocb_start; + while (ocb < ocb_end) { + int load_step; + init_load(ocb, ocb_end, load_step); + for (int icb = 0; icb < nb_ic; icb += nb_ic_blocking) { + init_reduce(icb); + ker_1x1(ocb, ocb_start, icb, n, g, od, oh, ow, id, ih, + iw); + } + ocb += load_step; + } + iwork += bcast_step; + } + } else { + assert(!"unsupported loop order"); + } + }; + + auto ker_dw = [&](int n, int ocb_start, int load_step, int &dw_oh) { + auto &jcp_dw = pd()->dw_conv_pd_->jcp_; + int oh_1x1 = nstl::max(dw_oh * jcp_dw.stride_h - jcp_dw.t_pad, 0); + + for (int i = 0; i < jcp_dw.kh; ++i) + addrs[i] = pbuf + ((oh_1x1++) % jcp_dw.kh) * row_offset; + + const auto ocb_end = ocb_start + load_step; + const auto wch_stride = (is_src_layout_nxc ? 
1 : jcp_dw.iw) + * jcp_dw.nb_ch_blocking * jcp_dw.ch_block; + const int dil_h = jcp_dw.dilate_h + 1; + const int str_h = jcp_dw.stride_h; + const int ch_num = jcp_dw.nb_ch_blocking; + const int ow = 0; + const int kw = 0; + + for (int ch = ocb_start; ch < ocb_end; ch += jcp_dw.nb_ch_blocking) { + + const int i_t_overflow + = nstl::max(0, (int)(jcp_dw.t_pad - dw_oh * str_h)); + const int i_b_overflow + = nstl::max(jcp_dw.ih, + (int)(dw_oh * str_h + (jcp_dw.kh - 1) * dil_h + - jcp_dw.t_pad + 1)) + - jcp_dw.ih; + + const int kh = div_up(i_t_overflow, dil_h); + const int kh_padding = jcp_dw.kh - div_up(i_t_overflow, dil_h) + - div_up(i_b_overflow, dil_h); + + jit_conv_call_s par_conv_dw; + + par_conv_dw.src = addrs.data(); + + const size_t ch_step = is_dst_layout_nxc + ? jcp_dw.ch_block + : dst_d.blk_off(0, 1, 0, 0); + par_conv_dw.dst + = &dst[dst_d.blk_off(n, 0, dw_oh, ow) + ch * ch_step]; + + par_conv_dw.filt + = &weights_dw[dw_weights_d.blk_off(ch, 0, 0, kh, kw)]; + if (bias) + par_conv_dw.bias + = &bias_dw[dw_bias_d.blk_off(ch * jcp_dw.ch_block)]; + + par_conv_dw.kh_padding = (size_t)nstl::max(0, kh_padding); + + par_conv_dw.load_work = (nstl::min(ch + ch_num, jcp_dw.nb_ch) - ch) + * jcp_dw.ch_block; + + par_conv_dw.oc_l_off = ch * jcp_dw.ch_block; + par_conv_dw.post_ops_binary_rhs_arg_vec + = post_ops_binary_rhs_arg_vec_dw; + par_conv_dw.dst_orig = dst; + + (*kernel_dw_)(&par_conv_dw); + + for (int i = 0; i < jcp_dw.kh; ++i) + addrs[i] += wch_stride; + } + }; + + auto conv_dw = [&]() { + // Set variables + auto dw_conv_buffer + = dw_scratchpad.get(key_fusion_inout_buffer); + auto &jcp_dw = pd()->dw_conv_pd_->jcp_; + + const auto dw_conv_buffer_size_ + = (size_t)jcp_dw.kh * jcp.ow * nb_buffer * jcp.oc_block; + pbuf = dw_conv_buffer + ithr * dw_conv_buffer_size_; + row_offset = dw_conv_buffer_size_ / jcp_dw.kh; + addrs.resize(jcp_dw.kh); + + int bcast_start {0}, bcast_end {0}, ocb_start {0}, ocb_end {0}; + balance2D(nthr, ithr, jcp.mb * jcp.ngroups * jcp_dw.oh, bcast_start, + bcast_end, nb_oc, ocb_start, ocb_end, jcp.load_grp_count); + + while (ocb_start < ocb_end) { + int load_step; + init_load(ocb_start, ocb_end, load_step); + + int oh_1x1 = 0; + auto bcast_iter = bcast_start; + while (bcast_iter < bcast_end) { + int n {0}, g {0}, oh_dw {0}; + nd_iterator_init(bcast_iter, n, jcp.mb, g, jcp.ngroups, oh_dw, + jcp_dw.oh); + if (oh_dw == 0) oh_1x1 = 0; // Reset over mb boundary + const int oh_1x1_range = oh_dw * jcp_dw.stride_h - jcp_dw.t_pad; + const int oh_1x1_begin = nstl::max(oh_1x1_range, 0); + const int oh_1x1_end + = nstl::min(oh_1x1_range + jcp_dw.kh, jcp.oh); + oh_1x1 = nstl::max( + oh_1x1_begin, oh_1x1); // Skip rows computed previously + + // dw_spatial to 1x1 spatial conversion. 
if jcp.oh != jcp_dw.oh + const int bcast_start_1x1 + = n * jcp.ngroups * jcp.oh + g * jcp.oh + oh_1x1; + const int bcast_end_1x1 = bcast_start_1x1 - oh_1x1 + oh_1x1_end; + + conv_1x1(bcast_start_1x1, bcast_end_1x1, ocb_start, + ocb_start + load_step); + oh_1x1 = oh_1x1_end; + ker_dw(n, g * nb_oc + ocb_start, load_step, oh_dw); + + bcast_iter += nb_bcast_blocking; + } + ocb_start += load_step; + } + }; + + if (jcp.with_dw_conv) { + conv_dw(); + } else { + + const int work_amount = jcp.mb * jcp.ngroups * jcp.nb_bcast; + int bcast_start {0}, bcast_end {0}, ocb_start {0}, ocb_end {0}; + balance2D(nthr, ithr, work_amount, bcast_start, bcast_end, jcp.nb_load, + ocb_start, ocb_end, jcp.load_grp_count); + + conv_1x1(bcast_start, bcast_end, ocb_start, ocb_end); + } +} + +template struct jit_sve_1x1_convolution_fwd_t; +template struct jit_sve_1x1_convolution_fwd_t; + +/* convolution backward wtr data */ +template +void jit_sve_1x1_convolution_bwd_data_t::execute_backward_data(const exec_ctx_t &ctx) const { + auto diff_dst = CTX_IN_MEM(const diff_dst_data_t *, DNNL_ARG_DIFF_DST); + auto weights = CTX_IN_MEM(const wei_data_t *, DNNL_ARG_WEIGHTS); + auto diff_src = CTX_OUT_MEM(diff_src_data_t *, DNNL_ARG_DIFF_SRC); + + const memory_desc_wrapper diff_dst_d(pd()->diff_dst_md()); + const memory_desc_wrapper weights_d(pd()->weights_md(0)); + const memory_desc_wrapper diff_src_d(pd()->diff_src_md()); + + const auto &jcp = kernel_->jcp; + auto rtus_space = pd()->rtus_.reduce_src_ + ? ctx.get_scratchpad_grantor().template get( + key_conv_rtus_space) + : nullptr; + + const int ndims = diff_src_d.ndims(); + + assert(jcp.stride_w == 1 && jcp.stride_h == 1 && jcp.stride_d == 1); + + const int stride_d = (ndims == 5) ? pd()->desc()->strides[0] : 1; + const int stride_h = (ndims == 3) ? 1 : pd()->desc()->strides[ndims - 4]; + const int stride_w = pd()->desc()->strides[ndims - 3]; + + const int nb_ic = jcp.nb_load; + const int nb_oc = jcp.nb_reduce; + const int os_block = jcp.bcast_block; + const int nb_oc_blocking = jcp.nb_reduce_blocking; + + const int work_amount = jcp.mb * jcp.ngroups * jcp.nb_bcast; + + auto step = [](int default_step, int remaining, int tail_step) { + assert(default_step <= tail_step); + return remaining < tail_step ? remaining : default_step; + }; + + parallel(jcp.nthr, [&](const int ithr, const int nthr) { + auto p = jit_1x1_conv_call_s(); + auto rp = typename rtus_driver_t::call_params_t(); + + int bcast_start {0}, bcast_end {0}, icb_start {0}, icb_end {0}; + balance2D(nthr, ithr, work_amount, bcast_start, bcast_end, jcp.nb_load, + icb_start, icb_end, jcp.load_grp_count); + + bool reduce_outer + = (jcp.loop_order == loop_rbl || jcp.loop_order == loop_rlb); + int nboc_outer = reduce_outer ? nb_oc : 1; + int ocb_outer_step = reduce_outer ? nb_oc_blocking : 1; + + int nboc_inner = reduce_outer ? 1 : nb_oc; + int ocb_inner_step = reduce_outer ? 
1 : nb_oc_blocking; + const int max_ic = nstl::min(icb_end * jcp.ic_block, jcp.ic); + + for (int ocb_outer = 0; ocb_outer < nboc_outer; + ocb_outer += ocb_outer_step) { + size_t cur_ocb_outer + = nstl::min(ocb_outer + ocb_outer_step, nboc_outer) + - ocb_outer; + + int load_step = 0; + for (int icb = icb_start; icb < icb_end; icb += load_step) { + load_step = step(jcp.nb_load_blocking, jcp.nb_load - icb, + jcp.nb_load_blocking_max); + + p.load_dim = this_block_size( + icb * jcp.ic_block, max_ic, load_step * jcp.ic_block); + rp.icb = p.load_dim; + int bcast_step; + for (int iwork = bcast_start; iwork < bcast_end; + iwork += bcast_step) { + int n {0}, g {0}, osb {0}; + nd_iterator_init(iwork, n, jcp.mb, g, jcp.ngroups, osb, + jcp.nb_bcast); + + bcast_step = step(jcp.nb_bcast_blocking, jcp.nb_bcast - osb, + jcp.nb_bcast_blocking_max); + bcast_step = nstl::min(bcast_step, bcast_end - iwork); + + const int os = osb * os_block; + p.bcast_dim = this_block_size( + os, jcp.os, bcast_step * os_block); + rp.os = p.bcast_dim; + const int od = os / (jcp.oh * jcp.ow); + const int os_2d = os % (jcp.oh * jcp.ow); + const int oh = os_2d / jcp.ow; + const int ow = os_2d % jcp.ow; + const int id = od * stride_d; + const int ih = oh * stride_h; + const int iw = ow * stride_w; + rp.iw_start = iw; + const bool is_dsrc_layout_nxc + = utils::one_of(jcp.src_tag, format_tag::nwc, + format_tag::nhwc, format_tag::ndhwc); + const int ic_off_idx = is_dsrc_layout_nxc + ? g * jcp.ic + icb * jcp.ic_block + : g * nb_ic + icb; + rp.src = diff_src + + data_blk_off( + diff_src_d, n, ic_off_idx, id, ih, iw); + if (pd()->rtus_.reduce_src_) { + rp.ws = rtus_space + + ithr * pd()->rtus_.space_per_thread_; + p.output_data = rp.ws; + } else + p.output_data = rp.src; + + for (int ocb_inner = 0; ocb_inner < nboc_inner; + ocb_inner += ocb_inner_step) { + int cur_ocb_inner + = nstl::min(ocb_inner + ocb_inner_step, + nboc_inner) + - ocb_inner; + + int ocb = reduce_outer ? ocb_outer : ocb_inner; + int nb_oc_blocking_step + = reduce_outer ? cur_ocb_outer : cur_ocb_inner; + const bool is_ddst_layout_nxc + = utils::one_of(jcp.dst_tag, format_tag::nwc, + format_tag::nhwc, format_tag::ndhwc); + const int oc_off_idx = is_ddst_layout_nxc + ? g * jcp.oc + ocb * jcp.oc_block + : g * nb_oc + ocb; + size_t diff_dst_off = data_blk_off( + diff_dst_d, n, oc_off_idx, od, oh, ow); + p.bcast_data = &diff_dst[diff_dst_off]; + + p.load_data = &weights[pd()->with_groups() + ? weights_d.blk_off(g, ocb, icb) + : weights_d.blk_off(ocb, icb)]; + + p.first_last_flag = ocb == 0 ? FLAG_REDUCE_FIRST : 0; + + p.reduce_dim = this_block_size(ocb * jcp.oc_block, + jcp.oc, nb_oc_blocking_step * jcp.oc_block); + + (*kernel_)(&p); + } + if (pd()->rtus_.reduce_src_) (*rtus_driver_)(&rp); + } + } + } + }); +} + +template struct jit_sve_1x1_convolution_bwd_data_t; +template struct jit_sve_1x1_convolution_bwd_data_t; + +/* convolution backward wrt weights */ + +#define wht_blk_off(d, g, ...) \ + (pd()->with_groups() ? 
(d).blk_off((g), __VA_ARGS__) \ + : (d).blk_off(__VA_ARGS__)) + +template +status_t jit_sve_1x1_convolution_bwd_weights_t::init(engine_t *engine) { + + CHECK(safe_ptr_assign(kernel_, + new jit_sve_1x1_conv_kernel( + pd()->jcp_, *pd()->attr(), *pd()->dst_md(0)))); + CHECK(safe_ptr_assign( + acc_ker_, new cpu_accumulator_1d_t())); + CHECK(safe_ptr_assign(reducer_bias_, + new cpu_reducer_t(pd()->reducer_bia_conf_))); + CHECK(kernel_->create_kernel()); + CHECK(acc_ker_->create_kernel()); + CHECK(reducer_bias_->create_kernel()); + + CHECK(init_rtus_driver(this)); + return status::success; +} +template +void jit_sve_1x1_convolution_bwd_weights_t::execute_backward_weights(const exec_ctx_t &ctx) + const { + auto diff_dst = CTX_IN_MEM(const data_t *, DNNL_ARG_DIFF_DST); + auto src = CTX_IN_MEM(const data_t *, DNNL_ARG_SRC); + auto diff_weights = CTX_OUT_MEM(data_t *, DNNL_ARG_DIFF_WEIGHTS); + auto diff_bias_in = CTX_OUT_MEM(data_t *, DNNL_ARG_DIFF_BIAS); + + const memory_desc_wrapper diff_dst_d(pd()->diff_dst_md()); + const memory_desc_wrapper src_d(pd()->src_md()); + const memory_desc_wrapper diff_weights_d(pd()->diff_weights_md(0)); + + const auto &jcp = kernel_->jcp; + + const auto scratchpad = ctx.get_scratchpad_grantor(); + auto rtus_space = pd()->rtus_.reduce_src_ + ? scratchpad.get(key_conv_rtus_space) + : NULL; + const bool is_bias_padded + = pd()->with_bias() && jcp.oc_without_padding % jcp.oc_block != 0; + + data_t *diff_bias = is_bias_padded + ? scratchpad.get(key_conv_padded_bias) + : diff_bias_in; + auto wei_reduction = scratchpad.get(key_conv_wei_reduction); + + const int ndims = src_d.ndims(); + const int wei_size = jcp.ngroups * rnd_up(jcp.oc, jcp.oc_block) + * rnd_up(jcp.ic, jcp.ic_block); + + simple_barrier::ctx_t reduction_barrier; + simple_barrier::ctx_init(&reduction_barrier); + + const auto reducer_bia_scratchpad + = memory_tracking::grantor_t(scratchpad, prefix_reducer_bia); + auto rb = this->reducer_bias_.get(); + rb->init(reducer_bia_scratchpad); + + // TODO (Roma): remove this restriction + assert(jcp.stride_w == 1 && jcp.stride_h == 1); + + const int nb_ic = jcp.nb_bcast; + const int nb_ic_blocking = jcp.nb_bcast_blocking; + + const int nb_oc = jcp.nb_load; + const int nb_oc_blocking = jcp.nb_load_blocking; + + const int sp_nb = jcp.nb_reduce; + const int mb_sp_work = jcp.mb * sp_nb; + + const int stride_h = (ndims == 3) ? 1 : pd()->desc()->strides[0]; + const int stride_w = pd()->desc()->strides[ndims - 3]; + + auto step = [](int default_step, int remaining, int tail_step) { + assert(default_step <= tail_step); + return remaining < tail_step ? remaining : default_step; + }; + + const bool is_src_layout_nxc = utils::one_of( + jcp.src_tag, format_tag::nwc, format_tag::nhwc, format_tag::ndhwc); + + const bool is_ddst_layout_nxc = utils::one_of( + jcp.dst_tag, format_tag::nwc, format_tag::nhwc, format_tag::ndhwc); + + auto maybe_zero_icpad = [&](const int g_start, const int g_end, + const int ocb_start, const int ocb_end) { + // write zeros to IC padded region. 
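+        // Editor's note - a worked example of the zeroing below, using
+        // purely hypothetical sizes (not taken from any real shape):
+        //   assume ic_block = oc_block = 16, ic_without_padding = 13,
+        //   nb_ic = 1  =>  ic_tail = 13 % 16 = 13.
+        // Only the last input-channel block (z_icb = nb_ic - 1) carries
+        // padding; the zeroed span starts ic_tail * oc_block = 208 floats
+        // into that [ic_block x oc_block] weight tile and covers
+        //   zero_work = (1 * 16 - 13) * 16 = 48
+        // values, so the padded rows of diff_weights end up as real zeros
+        // after the reduction instead of stale scratchpad contents.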
+ const int ic_tail = jcp.ic_without_padding % jcp.ic_block; + if (is_ddst_layout_nxc && ic_tail != 0) { + for_(int g = g_start; g < g_end; ++g) + for (int z_ocb = ocb_start; z_ocb < ocb_end; ++z_ocb) { + const int z_icb = nb_ic - 1; + const size_t off = wht_blk_off(diff_weights_d, g, z_ocb, z_icb) + + ic_tail * jcp.oc_block; + data_t *z_wei = diff_weights + off; + const int zero_work + = (nb_ic * jcp.ic_block - jcp.ic_without_padding) + * jcp.oc_block; + PRAGMA_OMP_SIMD() + for (int o = 0; o < zero_work; ++o) { + z_wei[o] = 0; + } + } + } + }; + + auto ker = [&](const int ithr, const int nthr) { + assert(nthr == jcp.nthr); + + const int ithr_ic_b = ithr % jcp.nthr_ic_b; + const int ithr_oc_b = ithr / jcp.nthr_ic_b % jcp.nthr_oc_b; + const int ithr_g = ithr / jcp.nthr_ic_b / jcp.nthr_oc_b % jcp.nthr_g; + const int ithr_mb = ithr / jcp.nthr_ic_b / jcp.nthr_oc_b / jcp.nthr_g; + + /* reduction dimension */ + int mb_sp_b_start {0}, mb_sp_b_end {0}; + balance211( + mb_sp_work, jcp.nthr_mb, ithr_mb, mb_sp_b_start, mb_sp_b_end); + + /* independent dimensions */ + int g_start {0}, oc_b_start {0}, ic_b_start {0}; + int g_end {0}, oc_b_end {0}, ic_b_end {0}; + + balance211(jcp.ngroups, jcp.nthr_g, ithr_g, g_start, g_end); + balance211(jcp.nb_load, jcp.nthr_oc_b, ithr_oc_b, oc_b_start, oc_b_end); + balance211( + jcp.nb_bcast, jcp.nthr_ic_b, ithr_ic_b, ic_b_start, ic_b_end); + + const int g_work = g_end - g_start; + const int oc_b_work = oc_b_end - oc_b_start; + const int ic_b_work = ic_b_end - ic_b_start; + const bool cache_aliasing + = (jcp.ic * jcp.ngroups * sizeof(float)) % 1024 == 0; + int reduce_step = jcp.nb_reduce_blocking; + int reduce_step_max = jcp.nb_reduce_blocking_max; + if (is_src_layout_nxc && cache_aliasing) { + // Experiments show 4 is a magic number with the tested shapes. + // TODO: maybe tune for shapes with sp_dim%4 != 0 + reduce_step = nstl::min(4, reduce_step); + reduce_step_max = reduce_step; + } + + data_t *diff_wei = ithr_mb == 0 + ? diff_weights + : wei_reduction + (ithr_mb - 1) * wei_size; + + int sp_b_step = 0; + for (int mb_sp_b = mb_sp_b_start; mb_sp_b < mb_sp_b_end; + mb_sp_b += sp_b_step) { + int img {0}, sp_b {0}; + nd_iterator_init(mb_sp_b, img, jcp.mb, sp_b, sp_nb); + sp_b_step = step(reduce_step, + nstl::min(sp_nb - sp_b, mb_sp_b_end - mb_sp_b), + reduce_step_max); + + for (int g = g_start; g < g_end; ++g) { + int load_step = 0; + int bcast_step = 0; + for (int ic_b = ic_b_start; ic_b < ic_b_end; + ic_b += bcast_step) { + if (is_src_layout_nxc && cache_aliasing) { + bcast_step = ic_b_work; + } else { + bcast_step = step(nb_ic_blocking, ic_b_end - ic_b, + jcp.nb_bcast_blocking_max); + } + + for (int oc_b = oc_b_start; oc_b < oc_b_end; + oc_b += load_step) { + load_step = step(nb_oc_blocking, oc_b_end - oc_b, + jcp.nb_load_blocking_max); + const int _ic_b = g * nb_ic + ic_b; + const int oc_off_idx = is_ddst_layout_nxc + ? g * jcp.oc + oc_b * jcp.oc_block + : g * nb_oc + oc_b; + + data_t *store_to; + + const size_t off + = wht_blk_off(diff_weights_d, g, oc_b, ic_b); + store_to = diff_wei + off; + + const int ic_off_idx + = (is_src_layout_nxc ? 
jcp.ic_block : 1) + * _ic_b; + const data_t *diff_src + = &src[src_d.blk_off(img, ic_off_idx)]; + + int sp_b_end = sp_b + sp_b_step; + const data_t *pdiff_dst = &diff_dst[diff_dst_d.blk_off( + img, oc_off_idx)]; + const data_t *local_src = diff_src; + + auto p = jit_1x1_conv_call_s(); + auto rp = typename rtus_driver_t::call_params_t(); + p.output_stride = utils::rnd_up(jcp.ic, jcp.ic_block) + * jcp.oc_block * jcp.typesize_out; + + p.load_dim = this_block_size(oc_b * jcp.oc_block, + jcp.oc, load_step * jcp.oc_block); + + p.bcast_dim = this_block_size(ic_b * jcp.ic_block, + jcp.ic, bcast_step * jcp.ic_block); + rp.icb = p.bcast_dim; + p.output_data = store_to; + + p.reduce_dim = sp_b_step * jcp.reduce_block; + rp.os = p.reduce_dim; + p.first_last_flag = 0 + | (mb_sp_b == mb_sp_b_start ? FLAG_REDUCE_FIRST + : 0) + | (sp_b_end == sp_nb ? FLAG_SP_LAST : 0); + + int sp = sp_b * jcp.reduce_block; + int oc_mult + = is_ddst_layout_nxc ? jcp.oc : jcp.oc_block; + p.load_data = pdiff_dst + sp * oc_mult; + + if (pd()->rtus_.reduce_src_) { + const int oh = sp / jcp.ow; + const int ow = sp % jcp.ow; + + const int ih = oh * stride_h; + const int iw = ow * stride_w; + rp.iw_start = iw; + + rp.ws = rtus_space + + ithr * pd()->rtus_.space_per_thread_ + + sp * jcp.ic_block; + + if (ndims == 3) + rp.src = local_src + + iw * src_d.blocking_desc().strides[2]; + else + rp.src = local_src + + ih * src_d.blocking_desc().strides[2] + + iw * src_d.blocking_desc().strides[3]; + (*rtus_driver_)(&rp); + + p.bcast_data = rp.ws; + } else { + int ic_mult + = is_src_layout_nxc ? jcp.ic : jcp.ic_block; + p.bcast_data = local_src + sp * ic_mult; + } + + (*kernel_)(&p); + } + } + } + } + + if (ithr_mb == 0 && ic_b_end >= jcp.nb_bcast) { + maybe_zero_icpad(g_start, g_end, oc_b_start, oc_b_end); + } + + /* diff_weights[:] += sum(wei_reduction[thr_mb][:]) */ + if (dnnl_thr_syncable() && jcp.nthr_mb > 1) { + simple_barrier::barrier(&reduction_barrier, jcp.nthr); + const int work = g_work * oc_b_work * ic_b_work; + int start {0}, end {0}; + balance211(work, jcp.nthr_mb, ithr_mb, start, end); + if (start == end) return; + + for (int thr_mb = 1; thr_mb < jcp.nthr_mb; ++thr_mb) { + int w = start; + int sub_g_start {0}, sub_oc_b_start {0}, sub_ic_b_start {0}; + nd_iterator_init(w, sub_g_start, g_work, sub_oc_b_start, + oc_b_work, sub_ic_b_start, ic_b_work); + while (w < end) { + const int g = g_start + sub_g_start; + const int oc_b = oc_b_start + sub_oc_b_start; + const int ic_b = ic_b_start + sub_ic_b_start; + const int ic_to_accumulate + = nstl::min(end - w, ic_b_work - sub_ic_b_start) + * jcp.ic_block; + const int acc_size + = this_block_size(ic_b * jcp.ic_block, + jcp.ic_without_padding, ic_to_accumulate) + * jcp.oc_block; + + const size_t off + = wht_blk_off(diff_weights_d, g, oc_b, ic_b); + data_t *d = diff_weights + off; + data_t *s = wei_reduction + (thr_mb - 1) * wei_size + off; + + acc_ker_->accumulate(d, s, acc_size); + + nd_iterator_jump(w, end, sub_g_start, g_work, + sub_oc_b_start, oc_b_work, sub_ic_b_start, + ic_b_work); + } + } + } + }; + + auto ker_bias = [&](int ithr, int nthr) { + assert(nthr == rb->balancer().nthr_); + + const int b_job_start = rb->balancer().ithr_job_off(ithr); + const int b_njobs = rb->balancer().ithr_njobs(ithr); + + if (b_njobs == 0) return; + + /* reduction dimension */ + int img_start {0}, img_end {0}; + + balance211(jcp.mb, rb->balancer().nthr_per_group_, + rb->balancer().id_in_group(ithr), img_start, img_end); + + /* jobs */ + int g_start {0}, ocb_start {0}; + nd_iterator_init( + 
b_job_start, g_start, jcp.ngroups, ocb_start, jcp.nb_load); + + for (int img = img_start; img < img_end; ++img) { + int g = g_start, ocb = ocb_start; + for (int b_job_loc = 0; b_job_loc < b_njobs; ++b_job_loc) { + const int oc_off_idx = is_ddst_layout_nxc + ? g * jcp.oc + ocb * jcp.oc_block + : g * jcp.nb_load + ocb; + const data_t *d_dst + = &diff_dst[diff_dst_d.blk_off(img, oc_off_idx)]; + + data_t *d_bias = rb->get_local_ptr(ithr, diff_bias, + reducer_bia_scratchpad) + + b_job_loc * rb->balancer().job_size_; + const int sp_shift = is_ddst_layout_nxc ? jcp.ngroups * jcp.oc + : jcp.oc_block; + const auto max_oc = this_block_size( + ocb * jcp.oc_block, jcp.oc, jcp.oc_block); + if (img == img_start) + for (int o = 0; o < jcp.oc_block; ++o) + d_bias[o] = 0.; + + for (int os = 0; os < jcp.os; ++os) { + PRAGMA_OMP_SIMD() + for (int o = 0; o < max_oc; ++o) + d_bias[o] += d_dst[o]; + d_dst += sp_shift; + } + + nd_iterator_step(g, jcp.ngroups, ocb, jcp.nb_load); + } + } + + if (dnnl_thr_syncable()) + rb->reduce(ithr, diff_bias, reducer_bia_scratchpad); + }; + + if (dnnl_thr_syncable()) { + parallel(jcp.nthr, [&](const int ithr, const int nthr) { + ker(ithr, jcp.nthr); + if (pd()->with_bias()) ker_bias(ithr, jcp.nthr); + }); + } else { + parallel(jcp.nthr, [&](int ithr, int nthr) { ker(ithr, nthr); }); + if (jcp.nthr_mb > 1) + parallel(jcp.nthr, [&](int ithr, int nthr) { + assert(nthr == jcp.nthr); + + const int ithr_ic_b = ithr % jcp.nthr_ic_b; + const int ithr_oc_b = ithr / jcp.nthr_ic_b % jcp.nthr_oc_b; + const int ithr_g + = ithr / jcp.nthr_ic_b / jcp.nthr_oc_b % jcp.nthr_g; + const int ithr_mb + = ithr / jcp.nthr_ic_b / jcp.nthr_oc_b / jcp.nthr_g; + + /* independent dimensions */ + int g_start {0}, oc_b_start {0}, ic_b_start {0}; + int g_end {0}, oc_b_end {0}, ic_b_end {0}; + + balance211(jcp.ngroups, jcp.nthr_g, ithr_g, g_start, g_end); + balance211(jcp.nb_load, jcp.nthr_oc_b, ithr_oc_b, oc_b_start, + oc_b_end); + balance211(jcp.nb_bcast, jcp.nthr_ic_b, ithr_ic_b, ic_b_start, + ic_b_end); + + const int g_work = g_end - g_start; + const int oc_b_work = oc_b_end - oc_b_start; + const int ic_b_work = ic_b_end - ic_b_start; + + const int work = g_work * oc_b_work * ic_b_work; + int start {0}, end {0}; + balance211(work, jcp.nthr_mb, ithr_mb, start, end); + if (start == end) return; + + for (int thr_mb = 1; thr_mb < jcp.nthr_mb; ++thr_mb) { + int w = start; + int sub_g_start {0}, sub_oc_b_start {0}, sub_ic_b_start {0}; + nd_iterator_init(w, sub_g_start, g_work, sub_oc_b_start, + oc_b_work, sub_ic_b_start, ic_b_work); + while (w < end) { + const int g = g_start + sub_g_start; + const int oc_b = oc_b_start + sub_oc_b_start; + const int ic_b = ic_b_start + sub_ic_b_start; + const int ic_to_accumulate + = nstl::min(end - w, ic_b_work - sub_ic_b_start) + * jcp.ic_block; + const int acc_size + = this_block_size(ic_b * jcp.ic_block, + jcp.ic_without_padding, + ic_to_accumulate) + * jcp.oc_block; + + const size_t off + = wht_blk_off(diff_weights_d, g, oc_b, ic_b); + data_t *d = diff_weights + off; + data_t *s + = wei_reduction + (thr_mb - 1) * wei_size + off; + + acc_ker_->accumulate(d, s, acc_size); + + nd_iterator_jump(w, end, sub_g_start, g_work, + sub_oc_b_start, oc_b_work, sub_ic_b_start, + ic_b_work); + } + } + }); + if (pd()->with_bias()) { + parallel(jcp.nthr, + [&](int ithr, int nthr) { ker_bias(ithr, nthr); }); + parallel(jcp.nthr, [&](int ithr, int nthr) { + assert(nthr == rb->balancer().nthr_); + MAYBE_UNUSED(nthr); + if (rb->balancer().ithr_njobs(ithr) == 0) return; + rb->reduce_nolock(ithr, 
diff_bias, reducer_bia_scratchpad); + }); + } + } + + /* TODO: put this in ker_bias */ + if (is_bias_padded) { + assert(IMPLICATION(!is_ddst_layout_nxc, jcp.ngroups == 1)); + const int padded_stride = rnd_up(jcp.oc, jcp.oc_block); + const int stride = jcp.oc_without_padding; + for (int g = 0; g < jcp.ngroups; ++g) { + utils::array_copy(diff_bias_in + g * stride, + diff_bias + g * padded_stride, stride); + } + } +} + +template struct jit_sve_1x1_convolution_bwd_weights_t; +template struct jit_sve_1x1_convolution_bwd_weights_t; + +} // namespace aarch64 +} // namespace cpu +} // namespace impl +} // namespace dnnl diff --git a/src/cpu/aarch64/jit_sve_1x1_convolution.hpp b/src/cpu/aarch64/jit_sve_1x1_convolution.hpp new file mode 100644 index 00000000000..fd0a19d94c5 --- /dev/null +++ b/src/cpu/aarch64/jit_sve_1x1_convolution.hpp @@ -0,0 +1,664 @@ +/******************************************************************************* +* Copyright 2021-2023 Intel Corporation +* Copyright 2021-2024 FUJITSU LIMITED +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#ifndef CPU_AARCH64_JIT_SVE_1X1_CONVOLUTION_HPP +#define CPU_AARCH64_JIT_SVE_1X1_CONVOLUTION_HPP + +#include "common/c_types_map.hpp" +#include "common/dnnl_thread.hpp" +#include "common/memory_tracking.hpp" +#include "common/primitive.hpp" +#include "common/primitive_hashing.hpp" +#include "common/utils.hpp" + +#include "cpu/cpu_convolution_pd.hpp" +#include "cpu/dw_convolution_utils.hpp" +#include "cpu/platform.hpp" + +#include "cpu/aarch64/cpu_reducer.hpp" +#include "cpu/aarch64/jit_sve_1x1_conv_kernel.hpp" +#include "cpu/aarch64/jit_uni_1x1_conv_utils.hpp" +#include "cpu/aarch64/jit_uni_dw_convolution.hpp" + +namespace dnnl { +namespace impl { +namespace cpu { +namespace aarch64 { + +template +struct jit_sve_1x1_convolution_fwd_t : public primitive_t { + struct pd_t : public cpu_convolution_fwd_pd_t { + using cpu_convolution_fwd_pd_t::cpu_convolution_fwd_pd_t; + + pd_t(const pd_t &other) : cpu_convolution_fwd_pd_t(other) { + if (copy(other) != status::success) is_initialized_ = false; + } + + DECLARE_COMMON_PD_T(JIT_IMPL_NAME_HELPER("jit_1x1:", isa_, ""), + jit_sve_1x1_convolution_fwd_t); + + status_t init(engine_t *engine) { + using namespace utils; + + bool ok = true && is_fwd() + && set_default_alg_kind(alg_kind::convolution_direct) + && expect_data_types(src_type, wei_type, dst_type, dst_type, + data_type::undef) + && attr()->has_default_values( + primitive_attr_t::skip_mask_t::post_ops, dst_type) + && !has_zero_dim_memory() && set_default_formats() + && attr_.set_default_formats(dst_md(0)) == status::success; + if (!ok) { return status::unimplemented; } + + const convolution_desc_t *conv_d = desc(); + const memory_desc_t *src_d = src_md(); + rtus_prepare(this, conv_d, src_d, dst_md()); + + CHECK(jit_sve_1x1_conv_kernel::init_conf(jcp_, *conv_d, + *src_d, *weights_md(), *dst_md(), *attr(), + dnnl_get_max_threads(), rtus_.reduce_src_)); + if 
(jcp_.with_dw_conv) CHECK(depthwise_po_init(engine)); + + auto scratchpad = scratchpad_registry().registrar(); + jit_sve_1x1_conv_kernel::init_scratchpad(scratchpad, jcp_); + + rtus_prepare_space_info(this, scratchpad, jcp_.nthr); + + return status::success; + } + + const memory_desc_t *dst_md( + int index = 0, bool user_input = false) const override { + return jcp_.with_dw_conv + ? dw_conv_pd_->dst_md(index, user_input) + : cpu_convolution_fwd_pd_t::dst_md(index, user_input); + } + + const memory_desc_t *arg_md( + int arg, bool user_input = false) const override { + if (jcp_.with_dw_conv) { + switch (arg) { + case DNNL_ARG_ATTR_POST_OP_DW | DNNL_ARG_SRC: + return cpu_convolution_fwd_pd_t::dst_md(0, user_input); + case DNNL_ARG_ATTR_POST_OP_DW | DNNL_ARG_WEIGHTS: + return dw_conv_pd_->weights_md(0); + case DNNL_ARG_ATTR_POST_OP_DW | DNNL_ARG_BIAS: + return dw_conv_pd_->weights_md(1); + default: break; + } + } + return convolution_fwd_pd_t::arg_md(arg, user_input); + } + + arg_usage_t arg_usage(int arg) const override { + if (arg == (DNNL_ARG_ATTR_POST_OP_DW | DNNL_ARG_WEIGHTS)) + return arg_usage_t::input; + + if (arg == (DNNL_ARG_ATTR_POST_OP_DW | DNNL_ARG_BIAS) + && attr_post_op_dw_inputs() > 1) + return arg_usage_t::input; + + return convolution_fwd_pd_t::arg_usage(arg); + } + + jit_1x1_conv_conf_t jcp_ = utils::zero(); + reduce_to_unit_stride_t rtus_ = utils::zero(); + using dw_pd_t = jit_sve_512_dw_convolution_fwd_t::pd_t; + std::unique_ptr dw_conv_pd_; + + protected: + bool set_default_formats() { + using namespace format_tag; + + const memory_desc_wrapper src_d(&src_md_); + const memory_desc_wrapper dst_d(&dst_md_); + + const auto dat_tag_nxc = utils::pick(ndims() - 3, nwc, nhwc, ndhwc); + format_tag_t dat_tag, wei_tag; + + switch (isa_) { + case sve_512: { + const auto dat_tag_nCx16c = utils::pick( + ndims() - 3, nCw16c, nChw16c, nCdhw16c); + const auto curr_src_tag = src_d.matches_one_of_tag( + dat_tag_nxc, dat_tag_nCx16c); + const auto curr_dst_tag = dst_d.matches_one_of_tag( + dat_tag_nxc, dat_tag_nCx16c); + const auto is_data_layout_nxc + = IMPLICATION(curr_src_tag != dat_tag_nxc, + src_d.format_kind() == format_kind::any) + && IMPLICATION(curr_dst_tag != dat_tag_nxc, + dst_d.format_kind() == format_kind::any) + && utils::one_of( + dat_tag_nxc, curr_src_tag, curr_dst_tag); + dat_tag = is_data_layout_nxc ? dat_tag_nxc : dat_tag_nCx16c; + wei_tag = utils::pick(2 * ndims() - 6 + with_groups(), + OIw16i16o, gOIw16i16o, OIhw16i16o, gOIhw16i16o, + OIdhw16i16o, gOIdhw16i16o); + break; + } + case sve_256: { + const auto dat_tag_nCx8c + = utils::pick(ndims() - 3, nCw8c, nChw8c, nCdhw8c); + const auto curr_src_tag = src_d.matches_one_of_tag( + dat_tag_nxc, dat_tag_nCx8c); + const auto curr_dst_tag = dst_d.matches_one_of_tag( + dat_tag_nxc, dat_tag_nCx8c); + const auto is_data_layout_nxc + = IMPLICATION(curr_src_tag != dat_tag_nxc, + src_d.format_kind() == format_kind::any) + && IMPLICATION(curr_dst_tag != dat_tag_nxc, + dst_d.format_kind() == format_kind::any) + && utils::one_of( + dat_tag_nxc, curr_src_tag, curr_dst_tag); + dat_tag = is_data_layout_nxc ? 
dat_tag_nxc : dat_tag_nCx8c; + wei_tag = utils::pick(2 * ndims() - 6 + with_groups(), + OIw8i8o, gOIw8i8o, OIhw8i8o, gOIhw8i8o, OIdhw8i8o, + gOIdhw8i8o); + break; + } + default: break; + } + return set_default_formats_common(dat_tag, wei_tag, dat_tag); + } + status_t copy(const pd_t &other) { + jcp_ = other.jcp_; + rtus_ = other.rtus_; + if (other.dw_conv_pd_) { + dw_conv_pd_.reset(other.dw_conv_pd_->clone()); + if (!dw_conv_pd_) return status::out_of_memory; + } + return status::success; + } + + status_t depthwise_po_init(engine_t *engine) { + + using namespace memory_tracking; + auto &jcp_1x1 = jcp_; + primitive_attr_t attr_1x1(*attr()); + if (!attr_1x1.is_initialized()) return status::out_of_memory; + const auto &src_md = dst_md_; + const memory_desc_wrapper src_d(src_md); + const auto nthr = dnnl_get_max_threads(); + auto l2_cache = platform::get_per_core_cache_size(2) * nthr; + + // Note: A robust fusion implementation would check whether both the + // 1x1 conv and the dw conv considered here for fusion are optimal + // independently. This would require creating a new primitive_desc + // through the primitive iterator and checking that they match. + // Since these creations and/or checks could be heavy, + // for 1x1: check that no better ISA is available; + // for dw: always fuse with the same ISA. + // Caveat: a better dw conv may exist. + + // TODO: Add a check for whether a better ISA exists, following the + // note above. + bool ok = true + && (attr_1x1.post_ops_.find(primitive_kind::sum) == -1) + // TODO: the threshold below may be further tuned. + && (l2_cache * 2 < src_d.size()) + // The load_grp_count check may be redundant due to the l2 check + // above; it is kept explicit because the current driver doesn't + // work if this condition fails. + && (jcp_1x1.load_grp_count < 2); + if (!ok) return status::unimplemented; + + int dw_po_index + = attr_1x1.post_ops_.find(primitive_kind::convolution); + convolution_desc_t cd_dw; + primitive_attr_t attr_dw; + CHECK(get_depthwise_conv_desc( + cd_dw, src_md, attr_1x1, attr_dw, dw_po_index)); + + // The code below doesn't work because it currently requires the + // `jcp_` member, which is not available from the common interface. + // In turn, this means the common pd creation interface through an + // iterator can't be used and a specific convolution implementation's + // pd is required here. It restricts the usage of the inherited + // `convolution_pd_t` constructor. + // ANCHOR: USING_INHERITED_IS_IMPOSSIBLE. + // + // ```cpp + // primitive_desc_iterator_t it( + // engine, (op_desc_t *)&cd_dw, &attr_dw, nullptr); + // if (!it.is_initialized()) return status::out_of_memory; + // while (++it != it.end()) { + // dw_conv_pd_ = *it; + // break; + // } + // VDISPATCH_CONV_IC(dw_conv_pd_, "dw_conv_pd hasn't been created"); + // ``` + // + // ```compiler output + // error: ‘using element_type = struct dnnl::impl::primitive_desc_t’ + // {aka ‘struct dnnl::impl::primitive_desc_t’} has no member named + // ‘jcp_’ + // auto &jcp_dw = dw_conv_pd_->jcp_; + // ^~~~ + // ``` + // + // TODO: figure out a way to initialize the fused conv through a + // normal interface without hacks that access specific members. 
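+            // Editor's note - the workaround below, reduced to a minimal
+            // sketch (dw_pd_t is the concrete
+            // jit_sve_512_dw_convolution_fwd_t::pd_t aliased above, not a
+            // generic primitive_desc_t):
+            //
+            //   std::unique_ptr<dw_pd_t> pd(
+            //           new dw_pd_t(&cd_dw, &attr_dw, nullptr));
+            //   CHECK(pd->init(engine)); // may return unimplemented
+            //   auto &jcp_dw = pd->jcp_; // legal only on the concrete type
+            //
+            // i.e. the specific implementation's pd is constructed directly
+            // so that its jcp_ member stays visible, at the cost of
+            // bypassing the implementation iterator.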
+ CHECK(safe_ptr_assign( + dw_conv_pd_, new dw_pd_t(&cd_dw, &attr_dw, nullptr))); + CHECK(dw_conv_pd_->init(engine)); + auto &jcp_dw = dw_conv_pd_->jcp_; + + ok = true + && (dnnl_memory_desc_equal(&src_md, dw_conv_pd_->src_md(0))) + && (jcp_1x1.oc_without_padding % jcp_1x1.oc_block == 0) + && IMPLICATION( + jcp_dw.ow_block, jcp_dw.ow_block == jcp_dw.ow); + if (!ok) return status::unimplemented; + + assert(dw_conv_pd_->dst_md(0)->format_kind != format_kind::any); + assert(dw_conv_pd_->weights_md(0)->format_kind != format_kind::any); + assert(IMPLICATION( + dw_conv_pd_->weights_md(1)->data_type != data_type::undef, + dw_conv_pd_->weights_md(1)->format_kind + != format_kind::any)); + + jcp_dw.is_fused_conv = true; + // TODO: Support/experiment with arbitrary oc_work in dw conv. + // Until then, keep oc_work perfectly divisible. + while (jcp_1x1.nb_load % jcp_1x1.nb_load_blocking != 0) + --jcp_1x1.nb_load_blocking; + jcp_1x1.nb_load_blocking_max = jcp_1x1.nb_load_blocking; + + while (jcp_1x1.nb_load_blocking % jcp_dw.nb_ch_blocking != 0) + --jcp_dw.nb_ch_blocking; + + jcp_dw.dw_conv_buffer_oc + = jcp_1x1.nb_load_blocking * jcp_1x1.oc_block; + + const auto dat_tag_nxc = utils::pick(ndims() - 3, format_tag::nwc, + format_tag::nhwc, format_tag::ndhwc); + const bool is_data_nxc = utils::everyone_is( + dat_tag_nxc, jcp_1x1.src_tag, jcp_1x1.dst_tag); + if (!is_data_nxc) + jcp_1x1.bcast_loop_output_step = jcp_1x1.ur * jcp_1x1.load_block + * jcp_1x1.typesize_out; + + registrar_t scratchpad(scratchpad_registry_); + registrar_t dw_scratchpad(scratchpad, names::prefix_fusion); + + size_t dw_conv_buffer_size_ = (size_t)nthr * jcp_dw.kh * jcp_dw.iw + * jcp_dw.dw_conv_buffer_oc; + assert(dw_conv_buffer_size_); + dw_scratchpad.book(memory_tracking::names::key_fusion_inout_buffer, + dw_conv_buffer_size_, + types::data_type_size(dw_conv_pd_->src_md()->data_type)); + + jit_uni_dw_conv_fwd_kernel::init_scratchpad( + dw_scratchpad, jcp_dw); + + return status::success; + } + }; + + template + friend status_t init_rtus_driver(conv_t *self); + + jit_sve_1x1_convolution_fwd_t(const pd_t *apd) : primitive_t(apd) {} + + typedef typename prec_traits_t::type src_data_t; + typedef typename prec_traits_t::type wei_data_t; + typedef typename prec_traits_t::type dst_data_t; + + status_t init(engine_t *engine) override { + CHECK(safe_ptr_assign(kernel_, + new jit_sve_1x1_conv_kernel( + pd()->jcp_, *pd()->attr(), *pd()->dst_md(0)))); + CHECK(kernel_->create_kernel()); + + if (pd()->jcp_.with_dw_conv) { + CHECK(safe_ptr_assign( + kernel_dw_, new dw_conv_kernel_t(pd()->dw_conv_pd_->jcp_))); + CHECK(kernel_dw_->create_kernel()); + } + CHECK(init_rtus_driver(this)); + return status::success; + } + + status_t execute(const exec_ctx_t &ctx) const override { + execute_forward(ctx); + return status::success; + } + +private: + void execute_forward(const exec_ctx_t &ctx) const; + void execute_forward_thr(const int ithr, const int nthr, + const src_data_t *src, const wei_data_t *weights, + const dst_data_t *bias, const wei_data_t *weights_dw, + const dst_data_t *bias_dw, dst_data_t *dst, + const memory_tracking::grantor_t &scratchpad, + const void *post_ops_binary_rhs_arg_vec, + const void *post_ops_binary_rhs_arg_vec_dw) const; + const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); } + + std::unique_ptr> kernel_; + std::unique_ptr> rtus_driver_; + using dw_conv_kernel_t = jit_uni_dw_conv_fwd_kernel_f32; + std::unique_ptr kernel_dw_; +}; + +using jit_sve_256_1x1_convolution_fwd_f32_t + = jit_sve_1x1_convolution_fwd_t; 
+using jit_sve_512_1x1_convolution_fwd_f32_t + = jit_sve_1x1_convolution_fwd_t; + +template +struct jit_sve_1x1_convolution_bwd_data_t : public primitive_t { + struct pd_t : public cpu_convolution_bwd_data_pd_t { + using cpu_convolution_bwd_data_pd_t::cpu_convolution_bwd_data_pd_t; + + DECLARE_COMMON_PD_T(JIT_IMPL_NAME_HELPER("jit_1x1:", isa_, ""), + jit_sve_1x1_convolution_bwd_data_t); + + status_t init(engine_t *engine) { + bool ok = true && desc()->prop_kind == prop_kind::backward_data + && set_default_alg_kind(alg_kind::convolution_direct) + && expect_data_types(diff_src_type, wei_type, + data_type::undef, diff_dst_type, data_type::undef) + && attr()->has_default_values() && !has_zero_dim_memory() + && set_default_formats(); + if (!ok) return status::unimplemented; + + const convolution_desc_t *conv_d = desc(); + const memory_desc_t *diff_src_d = diff_src_md(); + rtus_prepare(this, conv_d, diff_src_d, diff_dst_md()); + + status_t status = jit_sve_1x1_conv_kernel::init_conf(jcp_, + *conv_d, *diff_src_d, *weights_md(), *diff_dst_md(), + *attr(), dnnl_get_max_threads(), rtus_.reduce_src_); + if (status != status::success) return status; + + auto scratchpad = scratchpad_registry().registrar(); + jit_sve_1x1_conv_kernel::init_scratchpad(scratchpad, jcp_); + + rtus_prepare_space_info(this, scratchpad, jcp_.nthr); + + return status::success; + } + + // TODO (Roma): structs conf header cleanup + jit_1x1_conv_conf_t jcp_ = utils::zero(); + reduce_to_unit_stride_t rtus_ = utils::zero(); + + protected: + bool set_default_formats() { + using namespace format_tag; + + const memory_desc_wrapper diff_src_d(&diff_src_md_); + const memory_desc_wrapper diff_dst_d(&diff_dst_md_); + + const auto dat_tag_nxc = utils::pick(ndims() - 3, nwc, nhwc, ndhwc); + format_tag_t dat_tag, wei_tag; + + switch (isa_) { + case sve_512: { + const auto dat_tag_nCx16c = utils::pick( + ndims() - 3, nCw16c, nChw16c, nCdhw16c); + const auto curr_src_tag = diff_src_d.matches_one_of_tag( + dat_tag_nxc, dat_tag_nCx16c); + const auto curr_dst_tag = diff_dst_d.matches_one_of_tag( + dat_tag_nxc, dat_tag_nCx16c); + const auto is_data_layout_nxc + = IMPLICATION(curr_src_tag != dat_tag_nxc, + diff_src_d.format_kind() + == format_kind::any) + && IMPLICATION(curr_dst_tag != dat_tag_nxc, + diff_dst_d.format_kind() + == format_kind::any) + && utils::one_of( + dat_tag_nxc, curr_src_tag, curr_dst_tag); + dat_tag = is_data_layout_nxc ? dat_tag_nxc : dat_tag_nCx16c; + wei_tag = utils::pick(2 * ndims() - 6 + with_groups(), + IOw16o16i, gIOw16o16i, IOhw16o16i, gIOhw16o16i, + IOdhw16o16i, gIOdhw16o16i); + break; + } + case sve_256: { + const auto dat_tag_nCx8c + = utils::pick(ndims() - 3, nCw8c, nChw8c, nCdhw8c); + const auto curr_src_tag = diff_src_d.matches_one_of_tag( + dat_tag_nxc, dat_tag_nCx8c); + const auto curr_dst_tag = diff_dst_d.matches_one_of_tag( + dat_tag_nxc, dat_tag_nCx8c); + const auto is_data_layout_nxc + = IMPLICATION(curr_src_tag != dat_tag_nxc, + diff_src_d.format_kind() + == format_kind::any) + && IMPLICATION(curr_dst_tag != dat_tag_nxc, + diff_dst_d.format_kind() + == format_kind::any) + && utils::one_of( + dat_tag_nxc, curr_src_tag, curr_dst_tag); + dat_tag = is_data_layout_nxc ? 
dat_tag_nxc : dat_tag_nCx8c; + wei_tag = utils::pick(2 * ndims() - 6 + with_groups(), + IOw8o8i, gIOw8o8i, IOhw8o8i, gIOhw8o8i, IOdhw8o8i, + gIOdhw8o8i); + break; + } + default: break; + } + return set_default_formats_common(dat_tag, wei_tag, dat_tag); + } + }; + + template + friend status_t init_rtus_driver(conv_t *self); + + jit_sve_1x1_convolution_bwd_data_t(const pd_t *apd) : primitive_t(apd) {} + + typedef typename prec_traits_t::type diff_dst_data_t; + typedef typename prec_traits_t::type wei_data_t; + typedef typename prec_traits_t::type diff_src_data_t; + + status_t init(engine_t *engine) override { + CHECK(safe_ptr_assign(kernel_, + new jit_sve_1x1_conv_kernel( + pd()->jcp_, *pd()->attr(), *pd()->dst_md(0)))); + CHECK(kernel_->create_kernel()); + CHECK(init_rtus_driver(this)); + return status::success; + } + + status_t execute(const exec_ctx_t &ctx) const override { + execute_backward_data(ctx); + return status::success; + } + +private: + void execute_backward_data(const exec_ctx_t &ctx) const; + const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); } + std::unique_ptr> kernel_; + std::unique_ptr> rtus_driver_; +}; +using jit_sve_256_1x1_convolution_bwd_data_f32_t + = jit_sve_1x1_convolution_bwd_data_t; + +using jit_sve_512_1x1_convolution_bwd_data_f32_t + = jit_sve_1x1_convolution_bwd_data_t; + +/* Backward weight */ +template +struct jit_sve_1x1_convolution_bwd_weights_t : public primitive_t { + struct pd_t : public cpu_convolution_bwd_weights_pd_t { + using cpu_convolution_bwd_weights_pd_t:: + cpu_convolution_bwd_weights_pd_t; + + DECLARE_COMMON_PD_T(JIT_IMPL_NAME_HELPER("jit_1x1:", isa_, ""), + jit_sve_1x1_convolution_bwd_weights_t); + + status_t init(engine_t *engine) { + bool ok = true && desc()->prop_kind == prop_kind::backward_weights + && set_default_alg_kind(alg_kind::convolution_direct) + && expect_data_types(data_type::f32, data_type::f32, + data_type::f32, data_type::f32, data_type::f32) + && attr()->has_default_values() && !has_zero_dim_memory() + && set_default_formats(); + if (!ok) { return status::unimplemented; } + + const convolution_desc_t *conv_d = desc(); + const memory_desc_t *src_d = src_md(); + rtus_prepare(this, conv_d, src_d, diff_dst_md()); + + status_t status = jit_sve_1x1_conv_kernel::init_conf(jcp_, + *conv_d, *src_d, *diff_weights_md(), *diff_dst_md(), + *attr(), dnnl_get_max_threads(), rtus_.reduce_src_); + if (status != status::success) return status; + + init_balancers(); + + auto scratchpad = scratchpad_registry().registrar(); + jit_sve_1x1_conv_kernel::init_scratchpad(scratchpad, jcp_); + + auto reducer_bia_scratchpad = memory_tracking::registrar_t( + scratchpad, memory_tracking::names::prefix_reducer_bia); + reducer_bia_conf_.init_scratchpad(reducer_bia_scratchpad); + rtus_prepare_space_info(this, scratchpad, jcp_.nthr); + + return status::success; + } + + // TODO (Roma): structs conf header cleanup + jit_1x1_conv_conf_t jcp_ = utils::zero(); + typename cpu_reducer_t::conf_t reducer_bia_conf_; + reduce_to_unit_stride_t rtus_ = utils::zero(); + + protected: + bool set_default_formats() { + using namespace format_tag; + + const memory_desc_wrapper src_d(&src_md_); + const memory_desc_wrapper diff_dst_d(&diff_dst_md_); + + const auto dat_tag_nxc = utils::pick(ndims() - 3, nwc, nhwc, ndhwc); + + format_tag_t dat_tag, wei_tag; + + switch (isa_) { + case sve_512: { + auto dat_tag_nCx16c = utils::pick( + ndims() - 3, nCw16c, nChw16c, nCdhw16c); + const auto curr_src_tag = src_d.matches_one_of_tag( + dat_tag_nxc, dat_tag_nCx16c); + 
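+                    // Editor's note on the layout selection below: nxc is
+                    // chosen only when no memory descriptor contradicts it,
+                    // i.e. every md that does not already match nxc was left
+                    // as format_kind::any and at least one md really matches
+                    // nxc. For example, src = nhwc with diff_dst = any picks
+                    // nxc, while src = nChw16c forces the blocked layout.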
const auto curr_dst_tag = diff_dst_d.matches_one_of_tag( + dat_tag_nxc, dat_tag_nCx16c); + const auto is_data_layout_nxc + = IMPLICATION(curr_src_tag != dat_tag_nxc, + src_d.format_kind() == format_kind::any) + && IMPLICATION(curr_dst_tag != dat_tag_nxc, + diff_dst_d.format_kind() + == format_kind::any) + && utils::one_of( + dat_tag_nxc, curr_src_tag, curr_dst_tag); + + dat_tag = is_data_layout_nxc ? dat_tag_nxc : dat_tag_nCx16c; + wei_tag = utils::pick(2 * ndims() - 6 + with_groups(), + OIw16i16o, gOIw16i16o, OIhw16i16o, gOIhw16i16o, + OIdhw16i16o, gOIdhw16i16o); + break; + } + case sve_256: { + const auto dat_tag_nCx8c + = utils::pick(ndims() - 3, nCw8c, nChw8c, nCdhw8c); + const auto curr_src_tag = src_d.matches_one_of_tag( + dat_tag_nxc, dat_tag_nCx8c); + const auto curr_dst_tag = diff_dst_d.matches_one_of_tag( + dat_tag_nxc, dat_tag_nCx8c); + const auto is_data_layout_nxc + = IMPLICATION(curr_src_tag != dat_tag_nxc, + src_d.format_kind() == format_kind::any) + && IMPLICATION(curr_dst_tag != dat_tag_nxc, + diff_dst_d.format_kind() + == format_kind::any) + && utils::one_of( + dat_tag_nxc, curr_src_tag, curr_dst_tag); + + dat_tag = is_data_layout_nxc ? dat_tag_nxc : dat_tag_nCx8c; + wei_tag = utils::pick(2 * ndims() - 6 + with_groups(), + OIw8i8o, gOIw8i8o, OIhw8i8o, gOIhw8i8o, OIdhw8i8o, + gOIdhw8i8o); + break; + } + default: break; + } + return set_default_formats_common(dat_tag, wei_tag, dat_tag); + } + + private: + void init_balancers() { + const size_t max_buffer_size = jcp_.nthr * 3 * 5 * 5 * 16 * 16; + if (with_bias()) { + reducer_bia_conf_.init(reduce_balancer_t(jcp_.nthr, + jcp_.oc_block, jcp_.ngroups * jcp_.nb_load, jcp_.mb, + max_buffer_size, true)); + } + } + }; + + template + friend status_t init_rtus_driver(conv_t *self); + + jit_sve_1x1_convolution_bwd_weights_t(const pd_t *apd) : primitive_t(apd) {} + + typedef typename prec_traits_t::type data_t; + + status_t init(engine_t *engine) override; + + status_t execute(const exec_ctx_t &ctx) const override { + execute_backward_weights(ctx); + return status::success; + } + +private: + void execute_backward_weights(const exec_ctx_t &ctx) const; + const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); } + + std::unique_ptr> kernel_; + std::unique_ptr> acc_ker_; + std::unique_ptr> reducer_bias_; + // std::unique_ptr trans_kernel_; + std::unique_ptr> rtus_driver_; +}; + +using jit_sve_256_1x1_convolution_bwd_weights_t + = jit_sve_1x1_convolution_bwd_weights_t; + +using jit_sve_512_1x1_convolution_bwd_weights_t + = jit_sve_1x1_convolution_bwd_weights_t; + +} // namespace aarch64 +} // namespace cpu +} // namespace impl +} // namespace dnnl + +#endif diff --git a/src/cpu/aarch64/jit_sve_512_1x1_conv_kernel.cpp b/src/cpu/aarch64/jit_sve_512_1x1_conv_kernel.cpp deleted file mode 100644 index 827b8904633..00000000000 --- a/src/cpu/aarch64/jit_sve_512_1x1_conv_kernel.cpp +++ /dev/null @@ -1,1333 +0,0 @@ -/******************************************************************************* -* Copyright 2021-2023 Intel Corporation -* Copyright 2021-2023 FUJITSU LIMITED -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -#include -#include - -#include "common/c_types_map.hpp" -#include "common/dnnl_thread.hpp" -#include "common/memory.hpp" -#include "common/memory_tracking.hpp" -#include "common/nstl.hpp" -#include "common/type_helpers.hpp" -#include "common/utils.hpp" - -#include "cpu/aarch64/cpu_barrier.hpp" -#include "cpu/platform.hpp" - -#include "cpu/aarch64/injectors/injector_utils.hpp" -#include "cpu/aarch64/injectors/jit_uni_binary_injector.hpp" -#include "cpu/aarch64/injectors/jit_uni_eltwise_injector.hpp" -#include "cpu/aarch64/jit_sve_512_1x1_conv_kernel.hpp" -#include "cpu/aarch64/jit_uni_1x1_conv_utils.hpp" - -#define GET_OFF(field) \ - static_cast(offsetof(jit_1x1_conv_call_s, field)) - -namespace dnnl { -namespace impl { -namespace cpu { -namespace aarch64 { - -using namespace dnnl::impl::format_tag; -using namespace dnnl::impl::prop_kind; -using namespace dnnl::impl::utils; - -jit_sve_512_1x1_conv_kernel::jit_sve_512_1x1_conv_kernel( - const jit_1x1_conv_conf_t &ajcp, const primitive_attr_t &attr, - const memory_desc_t &dst_md) - : jcp(ajcp), attr_(attr) { - if (jcp.with_eltwise || jcp.with_binary) { - using namespace binary_injector; - static constexpr bool preserve_gpr = true; - static constexpr bool preserve_vmm = false; - static constexpr size_t helper_vmm_idx = 31; - const size_t tail_size = jcp.oc_without_padding % isa_simd_width_; - static constexpr bool use_exact_tail_scalar_bcast = true; - - const rhs_arg_static_params_t rhs_arg_static_params {helper_vmm_idx, - x14, x15, x13, preserve_gpr, preserve_vmm, - GET_OFF(post_ops_binary_rhs_arg_vec), GET_OFF(dst_orig), - memory_desc_wrapper(dst_md), tail_size, k_load_dim_mask, - use_exact_tail_scalar_bcast}; - const static_params_t static_params { - this->param1, rhs_arg_static_params}; - - postops_injector_ = utils::make_unique< - injector::jit_uni_postops_injector_t>( - this, jcp.post_ops, static_params); - } -} - -void jit_sve_512_1x1_conv_kernel::bcast_loop(int load_loop_blk) { - - mov(aux1_reg_bcast_data, reg_bcast_data); - mov(aux_reg_bcast_data, reg_bcast_data); - mov(aux_reg_output_data, reg_output_data); - ldr(reg_bcast_loop_iter, ptr(X_SP, reg_bcast_loop_work_offt)); - - Label bcast_loop; - Label bcast_loop_tail; - Label large_tail; - - cmp_imm(reg_bcast_loop_iter, jcp.bcast_block, reg_tmp_imm); - b(LT, bcast_loop_tail); - - L(bcast_loop); - { - assert(jcp.bcast_block % jcp.ur == 0); - int num_substeps = jcp.bcast_block / jcp.ur; - assert(num_substeps > 0 && num_substeps < 10); - for (int i = 0; i < num_substeps; i++) { - if (i + 1 == num_substeps) L(large_tail); - reduce_loop(load_loop_blk, jcp.ur, i, false); - if (i < num_substeps - 1) { - add_imm(aux1_reg_bcast_data, aux1_reg_bcast_data, - jcp.bcast_loop_bcast_substep, reg_tmp_imm); - add_imm(aux_reg_output_data, aux_reg_output_data, - jcp.bcast_loop_output_substep, reg_tmp_imm); - } else { - add_imm(aux1_reg_bcast_data, aux1_reg_bcast_data, - jcp.bcast_loop_bcast_step - - (num_substeps - 1) - * jcp.bcast_loop_bcast_substep, - reg_tmp_imm); - add_imm(aux_reg_output_data, aux_reg_output_data, - jcp.bcast_loop_output_step - - (num_substeps - 1) - * jcp.bcast_loop_output_substep, - reg_tmp_imm); - } - subs_imm(reg_bcast_loop_iter, reg_bcast_loop_iter, jcp.ur, - reg_tmp_imm); - } - cmp_imm(reg_bcast_loop_iter, jcp.bcast_block, reg_tmp_imm); - b(GE, bcast_loop); - } - - L(bcast_loop_tail); - if 
(jcp.ur_tail) { - Label bcast_loop_tail_out; - if (jcp.ur_tail >= jcp.ur) { - cmp_imm(reg_bcast_loop_iter, jcp.ur, reg_tmp_imm); - b(GE, large_tail); - } - if (jcp.ur_tail % jcp.ur) { - cmp(reg_bcast_loop_iter, 0); - b(LE, bcast_loop_tail_out); - reduce_loop(load_loop_blk, jcp.ur_tail % jcp.ur, 0, true); - L(bcast_loop_tail_out); - } - } -} - -Xbyak_aarch64::XReg jit_sve_512_1x1_conv_kernel::output_ptr( - const bool is_out_layout_nxc, const int i_load, const int i_ur, - Xbyak_aarch64::XReg addr) { - if (one_of(jcp.prop_kind, forward_training, forward_inference, - backward_data)) { - int i_load_shift = is_out_layout_nxc - ? jcp.load_block - : (jcp.with_dw_conv ? jcp.ow : jcp.bcast_dim) * jcp.load_block; - int i_ur_shift = is_out_layout_nxc ? jcp.load_dim : jcp.load_block; - int offset = (i_load * i_load_shift + i_ur * i_ur_shift) - * jcp.typesize_out; - EVEX_compress_addr(addr, X_TMP_0, aux_reg_output_data, offset); - } else { - int offset = jcp.typesize_out * jcp.load_block * i_ur; - mov(X_TMP_0, i_load); - mul(X_TMP_0, reg_output_stride, X_TMP_0); - add_imm(X_TMP_1, X_TMP_0, offset, X_TMP_2); - add(addr, aux_reg_output_data, X_TMP_1); - } - return addr; -} - -static int vreg_accum_idx( - const int load_loop_blk, const int i_load, const int i_ur) { - return (i_ur * load_loop_blk + i_load); -} - -template -static void iterate(const int load_loop_blk, const int ur, const bool mask_tail, - const F &fun) { - for (int i_load = 0; i_load < load_loop_blk; ++i_load) { - const bool mask_flag = mask_tail && i_load + 1 == load_loop_blk; - for (int i_ur = 0; i_ur < ur; ++i_ur) - fun(mask_flag, i_load, i_ur); - } -} -template -static void iterate(const int load_loop_blk, const int ur, const F &fun) { - iterate(load_loop_blk, ur, false, fun); -} - -void jit_sve_512_1x1_conv_kernel::apply_postops( - const bool is_out_layout_nxc, const int load_loop_blk, const int ur) { - injector_utils::vmm_index_set_t vmm_idxs; - if (jcp.with_binary) { - binary_injector::rhs_arg_dynamic_params_t rhs_arg_params; - const auto mask_tail = jcp.oc_without_padding % jcp.load_block; - iterate(load_loop_blk, ur, mask_tail, - [&](const bool mask_flag, const int i_load, const int i_ur) { - const auto vmm_idx - = vreg_accum_idx(load_loop_blk, i_load, i_ur); - vmm_idxs.emplace(vmm_idx); - - rhs_arg_params.vmm_idx_to_out_reg.emplace( - vmm_idx, aux_reg_output_data); - rhs_arg_params.vmm_idx_to_out_elem_off_val.emplace(vmm_idx, - get_output_offset(is_out_layout_nxc, i_load, i_ur)); - if (mask_flag) - rhs_arg_params.vmm_tail_idx_.emplace(vmm_idx); - }); - - ldr(abi_param1, ptr(X_SP, reg_abi_param1_backup)); - - postops_injector_->compute_vector_range(vmm_idxs, rhs_arg_params); - } else { - iterate(load_loop_blk, ur, - [&](const bool, const int i_load, const int i_ur) { - vmm_idxs.emplace( - vreg_accum_idx(load_loop_blk, i_load, i_ur)); - }); - postops_injector_->compute_vector_range(vmm_idxs); - } -} - -void jit_sve_512_1x1_conv_kernel::reduce_loop( - int load_loop_blk, int ur, int substep, bool wraparound) { - - const bool out_layout_nxc = is_out_layout_nxc(jcp); - const bool load_layout_nxc = is_load_layout_nxc(jcp); - const bool bcast_layout_nxc = is_bcast_layout_nxc(jcp); - const int reduce_dim_tail = jcp.reduce_dim % jcp.reduce_block; - const int load_dim_tail = jcp.load_dim % jcp.load_block; - - auto vreg_load - = [=](int i_load) { return ZReg(ur * load_loop_blk + i_load); }; - - auto vreg_accum = [=](int i_load, int i_ur) { - return ZReg(vreg_accum_idx(load_loop_blk, i_load, i_ur)); - }; - - auto bias_ptr = [=](int i_load) { - 
return EVEX_compress_addr(X_DEFAULT_ADDR, X_TMP_0, reg_bias_data, - jcp.typesize_out * jcp.oc_block * i_load); - }; - - auto bcast_ptr = [=](int i_reduce, int i_ur, bool bcast, - const Xbyak_aarch64::XReg addr, - const Xbyak_aarch64::XReg tmp) { - assert(i_ur < jcp.ur); - assert(i_reduce <= jcp.reduce_loop_unroll); - int offt; - if (one_of(jcp.prop_kind, forward_training, forward_inference, - backward_data)) { - assert(jcp.reduce_loop_unroll == jcp.reduce_block); - const int reduce_mul = bcast_layout_nxc ? jcp.reduce_dim - : jcp.reduce_loop_unroll; - offt = (i_reduce == jcp.reduce_loop_unroll) - ? (jcp.bcast_dim + i_ur) * reduce_mul - : i_ur * reduce_mul + i_reduce; - } else { - int rmul = bcast_layout_nxc ? jcp.ic : jcp.ic_block; - offt = i_reduce * rmul + i_ur; - } - return EVEX_compress_addr( - addr, tmp, aux_reg_bcast_data, jcp.typesize_in * offt, bcast); - }; - - auto load_ptr = [=](int i_reduce, int i_load, - const Xbyak_aarch64::XReg addr, - const Xbyak_aarch64::XReg tmp) { - int offt; - int u0 = i_reduce % jcp.reduce_loop_unroll; - int u1 = i_reduce / jcp.reduce_loop_unroll; - int lmul = jcp.load_block - * (load_layout_nxc ? 1 - : utils::rnd_up( - jcp.reduce_dim, jcp.reduce_block)); - int rmul = load_layout_nxc ? jcp.load_dim : jcp.load_block; - offt = i_load * lmul + u0 * rmul; - return EVEX_compress_addr(addr, tmp, aux_reg_load_data, - u1 * jcp.reduce_loop_load_step + jcp.typesize_in * offt); - }; - - auto init = [=]() { - Label init_done; - Label init_zero; - - if (jcp.with_bias - && one_of(jcp.prop_kind, forward_training, forward_inference)) { - tst(reg_reduce_pos_flag, FLAG_REDUCE_FIRST); - b(EQ, init_zero); - - for (int i_load = 0; i_load < load_loop_blk; i_load++) - for (int i_ur = 0; i_ur < ur; ++i_ur) { - auto vreg_acc = vreg_accum(i_load, i_ur); - if (i_load + 1 == load_loop_blk && load_dim_tail) - ld1w(vreg_acc.s, k_load_dim_mask / T_z, - ptr(bias_ptr(i_load))); - else - ldr(vreg_acc, ptr(bias_ptr(i_load))); - } - b(init_done); - } - - L(init_zero); - - /* Zero clear */ - for (int i_load = 0; i_load < load_loop_blk; ++i_load) - for (int i_ur = 0; i_ur < ur; ++i_ur) { - auto r = vreg_accum(i_load, i_ur); - eor(r.d, r.d, r.d); - } - L(init_done); - }; - - auto store = [=]() { - Label store_noadd; - if (!jcp.with_sum) { - tst(reg_reduce_pos_flag, FLAG_REDUCE_FIRST); - b(NE, store_noadd); - } - - for (int i_ur = 0; i_ur < ur; ++i_ur) - for (int i_load = 0; i_load < load_loop_blk; ++i_load) { - auto r = vreg_accum(i_load, i_ur).s; - if (i_load + 1 == load_loop_blk && load_dim_tail) - ld1w(zreg_tmp.s, k_load_dim_mask / T_z, - ptr(output_ptr(out_layout_nxc, i_load, i_ur, - X_DEFAULT_ADDR))); - else - ldr(zreg_tmp, - ptr(output_ptr(out_layout_nxc, i_load, i_ur, - X_DEFAULT_ADDR))); - fadd(r, r, zreg_tmp.s); - } - - L(store_noadd); - if (jcp.with_eltwise || jcp.with_binary) { - Label store_nopostops; - tst(reg_reduce_pos_flag, FLAG_REDUCE_LAST); - b(EQ, store_nopostops); - - apply_postops(out_layout_nxc, load_loop_blk, ur); - - L(store_nopostops); - } - - auto store_output = [=](bool output_is_aligned) { - const auto mask_flag = load_dim_tail; - for (int i_ur = 0; i_ur < ur; ++i_ur) { - for (int i_load = 0; i_load < load_loop_blk; ++i_load) { - auto vreg_acc = vreg_accum(i_load, i_ur); - // for nxc_layout-bwd_w, weights are still padded and the - // output_ptr here can be uninitialized scratchpad. - // To ensure final output (after reduction) is zero-padded, - // here we zero-pad output by omitting the mask. 
- if (jcp.prop_kind != backward_weights - && (i_load + 1 == load_loop_blk && mask_flag)) { - st1w(vreg_acc.s, k_load_dim_mask / T_z, - ptr(output_ptr(out_layout_nxc, i_load, i_ur, - X_DEFAULT_ADDR))); - } else { - str(vreg_acc, - ptr(output_ptr(out_layout_nxc, i_load, i_ur, - X_DEFAULT_ADDR))); - } - } - } - }; - - Label unaligned_store, end_store; - tst(aux_reg_output_data, cpu_isa_traits::vlen - 1); - b(NE, unaligned_store); - store_output(true); - b(end_store); - L(unaligned_store); - { store_output(false); } - L(end_store); - }; - - auto fma_block = [=](bool last_block) { - const int i_reduce_end = reduce_dim_tail && last_block - ? reduce_dim_tail - : jcp.reduce_loop_unroll; - - for (int i_reduce = 0; i_reduce < i_reduce_end; i_reduce++) { - for (int i_load = 0; i_load < load_loop_blk; ++i_load) { - auto vreg = vreg_load(i_load); - if (i_load + 1 == load_loop_blk && load_dim_tail) - ld1w(vreg.s, k_load_dim_mask / T_z, - ptr(load_ptr(i_reduce, i_load, X_DEFAULT_ADDR, - X_TMP_0))); - else - ldr(vreg, - ptr(load_ptr(i_reduce, i_load, X_DEFAULT_ADDR, - X_TMP_0))); - } - - for (int i_ur = 0; i_ur < ur; ++i_ur) { - if (jcp.expl_bcast && load_loop_blk > 1) { - ldr(W_TMP_0, - ptr(bcast_ptr(i_reduce, i_ur, false, X_DEFAULT_ADDR, - X_TMP_1))); - dup(vreg_bcast.s, W_TMP_0); - } - for (int i_load = 0; i_load < load_loop_blk; ++i_load) { - auto vreg_acc = vreg_accum(i_load, i_ur); - if (i_load + 1 == load_loop_blk && load_dim_tail) { - ld1rw(zreg_tmp.s, P_ALL_ONE, - ptr(bcast_ptr(i_reduce, i_ur, true, - X_DEFAULT_ADDR, X_TMP_0))); - fmla(vreg_acc.s, k_load_dim_mask / T_m, - vreg_load(i_load).s, zreg_tmp.s); - } else if (jcp.expl_bcast && load_loop_blk > 1) { - fmla(vreg_acc.s, P_ALL_ONE / T_m, vreg_load(i_load).s, - vreg_bcast.s); - } else { - ld1rw(zreg_tmp.s, P_ALL_ONE, - ptr(bcast_ptr(i_reduce, i_ur, true, - X_DEFAULT_ADDR, X_TMP_0))); - fmla(vreg_acc.s, P_ALL_ONE / T_m, vreg_load(i_load).s, - zreg_tmp.s); - } - } - } - } - }; - - Label reduce_loop; - Label reduce_loop_tail; - - mov(aux_reg_load_data, reg_load_data); - - mov(aux_reg_bcast_data, aux1_reg_bcast_data); - init(); - - mov(reduce_loop_iter, reg_reduce_loop_work); - subs_imm(reduce_loop_iter, reduce_loop_iter, jcp.reduce_loop_unroll, - reg_tmp_imm); - b(LE, reduce_loop_tail); - - L(reduce_loop); - { - fma_block(false); - add_imm(aux_reg_bcast_data, aux_reg_bcast_data, - jcp.reduce_loop_bcast_step, reg_tmp_imm); - add_imm(aux_reg_load_data, aux_reg_load_data, jcp.reduce_loop_load_step, - reg_tmp_imm); - subs_imm(reduce_loop_iter, reduce_loop_iter, jcp.reduce_loop_unroll, - reg_tmp_imm); - b(GT, reduce_loop); - } - - L(reduce_loop_tail); - fma_block(true); - - store(); -} - -void jit_sve_512_1x1_conv_kernel::generate() { - preamble(); - - sub_imm(X_SP, X_SP, stack_space_needed, X_TMP_0); - if (jcp.with_binary) { - const auto zeroed_reg = x15; - eor(zeroed_reg, zeroed_reg, zeroed_reg); - str(zeroed_reg, ptr(X_SP, reg_binary_post_op_acc_off)); - str(param1, ptr(X_SP, reg_abi_param1_backup)); - } - - /* Pointers indicate weight, input, and output data */ - ldr(reg_bcast_data, ptr(abi_param1, GET_OFF(bcast_data))); // Input - ldr(reg_load_data, ptr(abi_param1, GET_OFF(load_data))); // Weight - ldr(reg_output_data, ptr(abi_param1, GET_OFF(output_data))); // Output - - /* Pointer indicates bias data if the layer has bias option */ - if (jcp.with_bias) ldr(reg_bias_data, ptr(abi_param1, GET_OFF(bias_data))); - - /* Get workloads of each loop */ - ldr(reg_load_loop_work, ptr(abi_param1, GET_OFF(load_dim))); - ldr(reg_bcast_loop_work, 
ptr(abi_param1, GET_OFF(bcast_dim))); - str(reg_bcast_loop_work, ptr(X_SP, reg_bcast_loop_work_offt)); - ldr(reg_reduce_loop_work, ptr(abi_param1, GET_OFF(reduce_dim))); - - /* A flag for controlling reduce loop */ - ldr(reg_reduce_pos_flag, ptr(abi_param1, GET_OFF(first_last_flag))); - if (jcp.prop_kind == backward_weights) - ldr(reg_output_stride, ptr(param1, GET_OFF(output_stride))); - - const int load_dim_tail - = (one_of(jcp.prop_kind, forward_training, forward_inference) - ? jcp.oc_without_padding - : jcp.load_dim) - % jcp.load_block; - if (load_dim_tail) { - const WReg w_tmp(reg_load_dim_tail_mask.getIdx()); - mov_imm(w_tmp, (1 << load_dim_tail) - 1); - str(zreg_tmp1, ptr(X_TRANSLATOR_STACK, -1, MUL_VL)); - index(zreg_tmp.s, 0, 1); - mov(zreg_tmp1.s, 1); - lsl(zreg_tmp1.s, P_ALL_ONE / T_m, zreg_tmp.s); - dup(zreg_tmp.s, w_tmp); - and_(zreg_tmp.d, zreg_tmp.d, zreg_tmp1.d); - cmpne(k_load_dim_tail_mask.s, P_ALL_ONE, zreg_tmp.s, 0); - ldr(zreg_tmp1, ptr(X_TRANSLATOR_STACK, -1, MUL_VL)); - } - - auto load_loop_body = [=](int load_loop_blk) { - if (load_dim_tail) { - eor(k_load_dim_mask.b, P_ALL_ONE / T_z, k_load_dim_mask.b, - k_load_dim_mask.b); - not_(k_load_dim_mask.b, P_ALL_ONE / T_z, k_load_dim_mask.b); - } - subs_imm(reg_load_loop_work, reg_load_loop_work, - load_loop_blk * jcp.load_loop_iter_step, reg_tmp_imm); - if (load_dim_tail) { - Label no_update_mask; - b(GE, no_update_mask); - mov(k_load_dim_mask.b, k_load_dim_tail_mask.b); - L(no_update_mask); - } - bcast_loop(load_loop_blk); - add_imm(reg_load_data, reg_load_data, - load_loop_blk * jcp.load_loop_load_step, reg_tmp_imm); - switch (jcp.prop_kind) { - case forward_training: - case forward_inference: - add_imm(reg_bias_data, reg_bias_data, - load_loop_blk * jcp.load_block * jcp.typesize_out, - reg_tmp_imm); - add_imm(reg_output_data, reg_output_data, - load_loop_blk * jcp.load_block * jcp.typesize_out - * (is_out_layout_nxc(jcp) - ? 1 - : (jcp.with_dw_conv - ? jcp.ow - : jcp.bcast_dim)), - reg_tmp_imm); - if (jcp.with_binary) { - const auto oc_off_oprnd = aux_reg_load_data; - ldr(oc_off_oprnd, ptr(X_SP, reg_binary_post_op_acc_off)); - add_imm(oc_off_oprnd, oc_off_oprnd, - jcp.load_block * load_loop_blk, X_TMP_0); - str(oc_off_oprnd, ptr(X_SP, reg_binary_post_op_acc_off)); - } - break; - case backward_data: - add_imm(reg_output_data, reg_output_data, - load_loop_blk * jcp.load_block * jcp.typesize_out - * (is_out_layout_nxc(jcp) ? 1 : jcp.bcast_dim), - reg_tmp_imm); - break; - case backward_weights: - for (int i_load = 0; i_load < load_loop_blk; i_load++) - add(reg_output_data, reg_output_data, reg_output_stride); - break; - default: assert(!"invalid prop_kind"); - } - }; - - const int simd_w = cpu_isa_traits::vlen / sizeof(float); - - Label load_loop_blk[7]; - - // with an implicit load_loop_block {6, 5, 4, 3, 2, 1} - static const int ur_cases_fma_embd_bcast[] = {2, 4, 5, 8, 14, 32}; - static const int ur_cases_fma_expl_bcast[] = {2, 5, 6, 9, 14, 32}; - - const int size_ur_cases_fma = jcp.expl_bcast - ? sizeof(ur_cases_fma_expl_bcast) - : sizeof(ur_cases_fma_embd_bcast); - - const int *ur_cases_fma = jcp.expl_bcast ? 
ur_cases_fma_expl_bcast - : ur_cases_fma_embd_bcast; - const int *ur_cases = ur_cases_fma; - const int num_ur_cases = size_ur_cases_fma / sizeof(*ur_cases); - - for (int ur_idx = num_ur_cases - 1; ur_idx > 0; ur_idx--) { - int label_idx = num_ur_cases - ur_idx - 1; - if (jcp.nb_load > label_idx && jcp.ur <= ur_cases[ur_idx]) { - cmp_imm(reg_load_loop_work, simd_w * (label_idx + 1), reg_tmp_imm); - b(LE, load_loop_blk[label_idx]); - } - } - - for (int ur_idx = 0; ur_idx < num_ur_cases; ur_idx++) { - int label_idx = num_ur_cases - ur_idx - 1; - if (jcp.nb_load > label_idx && jcp.ur <= ur_cases[ur_idx]) { - L(load_loop_blk[label_idx]); - { - if (label_idx == 0) { - cmp(reg_load_loop_work, 0); - b(LE, load_loop_blk[num_ur_cases]); - } - load_loop_body(label_idx + 1); - if (label_idx - 1 > 0) { - cmp_imm(reg_load_loop_work, 2 * label_idx * simd_w, - reg_tmp_imm); - b(EQ, load_loop_blk[label_idx - 1]); - } - cmp_imm(reg_load_loop_work, label_idx * simd_w, reg_tmp_imm); - b(GT, load_loop_blk[label_idx]); - } - for (int idx = label_idx - 1; idx >= 0; --idx) { - cmp_imm(reg_load_loop_work, simd_w * (idx + 1), reg_tmp_imm); - b(GE, load_loop_blk[idx]); - } - if (ur_idx < num_ur_cases - 2) { - cmp_imm(reg_load_loop_work, simd_w, reg_tmp_imm); - b(LE, load_loop_blk[0]); - } - } - } - L(load_loop_blk[num_ur_cases]); - - add_imm(X_SP, X_SP, stack_space_needed, X_TMP_0); - - postamble(); - if (jcp.with_eltwise) postops_injector_->prepare_table(); -} - -status_t jit_sve_512_1x1_conv_kernel::init_conf(jit_1x1_conv_conf_t &jcp, - const convolution_desc_t &cd, const memory_desc_wrapper &src_d, - const memory_desc_wrapper &weights_d, const memory_desc_wrapper &dst_d, - const primitive_attr_t &attr, int nthreads, bool reduce_src) { - - /* arch check */ - if (!mayiuse(sve_512)) return status::unimplemented; - - if (!everyone_is(data_type::f32, src_d.data_type(), weights_d.data_type(), - dst_d.data_type())) - return status::unimplemented; - - jcp.nthr = nthreads; - - const bool with_groups = weights_d.ndims() == src_d.ndims() + 1; - const int simd_w = cpu_isa_traits::vlen / sizeof(float); - const int ndims = src_d.ndims(); - /* Forward_[training, inference], backward_[data, weight] */ - jcp.prop_kind = cd.prop_kind; - - /* Check group option */ - jcp.ngroups = with_groups ? weights_d.dims()[0] : 1; - /* Batchsize */ - jcp.mb = src_d.dims()[0]; - /* Channel */ - jcp.oc_without_padding = dst_d.dims()[1] / jcp.ngroups; - jcp.oc = jcp.oc_without_padding; - jcp.ic_without_padding = src_d.dims()[1] / jcp.ngroups; - jcp.ic = jcp.ic_without_padding; - /* D, H, W */ - jcp.id = (ndims == 5) ? src_d.dims()[2] : 1; - jcp.ih = (ndims == 3) ? 1 : src_d.dims()[ndims - 2]; - jcp.iw = src_d.dims()[ndims - 1]; - jcp.od = (ndims == 5) ? dst_d.dims()[2] : 1; - jcp.oh = (ndims == 3) ? 1 : dst_d.dims()[ndims - 2]; - jcp.ow = dst_d.dims()[ndims - 1]; - /* Kernel size */ - jcp.kd = (ndims == 5) ? weights_d.dims()[with_groups + 2] : 1; - jcp.kh = (ndims == 3) ? 1 : weights_d.dims()[with_groups + ndims - 2]; - jcp.kw = weights_d.dims()[with_groups + ndims - 1]; - /* padding params */ - jcp.f_pad = (ndims == 5) ? cd.padding[0][0] : 0; - jcp.t_pad = (ndims == 3) ? 0 : cd.padding[0][ndims - 4]; - jcp.l_pad = cd.padding[0][ndims - 3]; - /* stride params */ - jcp.stride_d = (ndims == 5) ? cd.strides[0] : 1; - jcp.stride_h = (ndims == 3) ? 
1 : cd.strides[ndims - 4]; - jcp.stride_w = cd.strides[ndims - 3]; - /* bias info */ - jcp.with_bias = pick_by_prop_kind(jcp.prop_kind, cd.bias_desc.format_kind, - format_kind::undef, cd.diff_bias_desc.format_kind) - != format_kind::undef; - - /* Spatials */ - jcp.os = jcp.od * jcp.oh * jcp.ow; - jcp.is = jcp.id * jcp.ih * jcp.iw; - - /* Depthwise conv check */ - const auto &post_ops = attr.post_ops_; - const int dw_conv_ind = post_ops.find(primitive_kind::convolution); - jcp.with_dw_conv = dw_conv_ind != -1; - if (jcp.with_dw_conv) return status::unimplemented; - - /* Post operation check */ - // Using dw_conv_ind as upper-bound below, as post-ops after it will be - // handled in depthwise convolution. - const int eltwise_ind - = post_ops.find(primitive_kind::eltwise, 0, dw_conv_ind); - jcp.with_eltwise = eltwise_ind != -1; - if (jcp.with_eltwise) { - if (dst_d.data_type() == data_type::s32) return status::unimplemented; - } - - const int sum_ind = post_ops.find(primitive_kind::sum, 0, dw_conv_ind); - jcp.with_sum = sum_ind != -1; - - const int binary_ind - = post_ops.find(primitive_kind::binary, 0, dw_conv_ind); - jcp.with_binary = binary_ind != -1; - - if (dw_conv_ind >= 0) { - // dw_conv and post_ops after it are handled externally, so skip them - jcp.post_ops.entry_.assign(post_ops.entry_.cbegin(), - post_ops.entry_.cbegin() + dw_conv_ind); - } else { - jcp.post_ops = post_ops; - } - - /* Data format check */ - const auto dat_tag_nxc = pick(ndims - 3, nwc, nhwc, ndhwc); - const auto dat_tag_nCx16c = pick(ndims - 3, nCw16c, nChw16c, nCdhw16c); - jcp.src_tag = src_d.matches_one_of_tag(dat_tag_nxc, dat_tag_nCx16c); - jcp.dst_tag = dst_d.matches_one_of_tag(dat_tag_nxc, dat_tag_nCx16c); - bool is_data_layout_nxc - = utils::everyone_is(dat_tag_nxc, jcp.src_tag, jcp.dst_tag); - auto required_dat_tag = is_data_layout_nxc ? dat_tag_nxc : dat_tag_nCx16c; - - /* Channel padding check */ - bool ok_to_pad_channels = true && !is_data_layout_nxc && jcp.ngroups == 1 - && src_d.data_type() == data_type::f32; - - /* Input and output must be multiple of simd_w */ - if (ok_to_pad_channels) { - jcp.oc = rnd_up(jcp.oc, simd_w); - jcp.ic = rnd_up(jcp.ic, simd_w); - } - - using namespace injector; - - static constexpr bool sum_at_pos_0_only = true; - static constexpr bool sum_requires_scale_one = true; - static constexpr bool sum_requires_zp_zero = true; - const bool post_ops_ok_ = post_ops_ok(post_ops_ok_args_t(jcp.isa, - {eltwise, binary, sum}, jcp.post_ops, &dst_d, sum_at_pos_0_only, - sum_requires_scale_one, sum_requires_zp_zero)); - if (!post_ops_ok_) return status::unimplemented; - - bool args_ok = true && jcp.ngroups == 1 && jcp.src_tag == required_dat_tag - && jcp.dst_tag == required_dat_tag - && IMPLICATION(!is_data_layout_nxc, - jcp.oc % simd_w == 0 && jcp.ic % simd_w == 0) - && jcp.f_pad == 0 && jcp.t_pad == 0 && jcp.l_pad == 0 - && jcp.stride_w == 1 && jcp.stride_h == 1 && jcp.stride_d == 1 - && jcp.kd == 1 && jcp.kh == 1 && jcp.kw == 1 && jcp.ow == jcp.iw - && jcp.oh == jcp.ih && jcp.od == jcp.id; // enforce rpad=0 - if (!args_ok) return status::unimplemented; - - /* Channel blocking size is simd_w */ - jcp.ic_block = jcp.oc_block = simd_w; - - jcp.ver = ver_sve_512; - if (everyone_is(data_type::f32, src_d.data_type(), weights_d.data_type(), - dst_d.data_type())) { - const int is_bwd_d = jcp.prop_kind == backward_data; - /* Set weight data layout tag */ - format_tag_t wei_tag = with_groups - ? 
pick(2 * ndims - 6 + is_bwd_d, gOIw16i16o, gIOw16o16i,
- gOIhw16i16o, gIOhw16o16i, gOIdhw16i16o, gIOdhw16o16i)
- : pick(2 * ndims - 6 + is_bwd_d, OIw16i16o, IOw16o16i,
- OIhw16i16o, IOhw16o16i, OIdhw16i16o, IOdhw16o16i);
-
- jcp.wei_tag = weights_d.matches_one_of_tag(wei_tag);
- if (jcp.wei_tag != wei_tag) return status::unimplemented;
-
- // jcp.fma_step = 1;
- jcp.typesize_in = sizeof(prec_traits::type);
- jcp.typesize_out = sizeof(prec_traits::type);
- } else {
- // TODO: currently, only fp32 is supported
- return status::unimplemented;
- }
-
- /* once all the formats are set, check the padding consistency */
- if (!is_data_layout_nxc) {
- args_ok = true && jcp.ic <= src_d.padded_dims()[1]
- && jcp.oc <= dst_d.padded_dims()[1]
- && jcp.ic <= weights_d.padded_dims()[with_groups + 1]
- && jcp.oc <= weights_d.padded_dims()[with_groups + 0];
- if (!args_ok) return status::unimplemented;
- }
- // TODO: Optimize below params
- const int SMALL_SPATIAL = 10;
- const int BIG_SPATIAL = 65;
- const int BIG_REDUCE_DIM = 1024;
- const int BIG_LOAD_DIM = (jcp.reduce_dim >= 512) ? 256 : 512;
-
- int load_blocking {0};
- int load_blocking_max {0};
- int bcast_blocking {0};
- int bcast_blocking_max {0};
- int reduce_blocking {0};
- int reduce_blocking_max {0};
-
- jcp.load_grp_count = 1;
-
- // TODO: move check funcs into platform files
- const int L1_capacity
- = platform::get_per_core_cache_size(1) / sizeof(float);
- const int L2_size = platform::get_per_core_cache_size(2) / sizeof(float);
- const int L2_capacity = (L2_size * 3) / 4;
-
- /* FWD, BWD data */
- if (one_of(jcp.prop_kind, forward_training, forward_inference,
- backward_data)) {
-
- if (one_of(jcp.prop_kind, forward_training, forward_inference)) {
- /* Forward */
- if (jcp.with_dw_conv) jcp.ur = nstl::min(jcp.ow, jcp.ur);
- jcp.reduce_dim = jcp.ic; // src channel
- jcp.reduce_block = jcp.ic_block; // src simd_w
-
- jcp.load_dim = jcp.oc; // dst channel
- jcp.load_block = jcp.oc_block; // dst simd_w
-
- jcp.bcast_dim = jcp.is; // src H*W
- } else {
- /* Backward data */
- jcp.reduce_dim = jcp.oc; // src channel
- jcp.reduce_block = jcp.oc_block; // src simd_w
-
- jcp.load_dim = jcp.ic; // dst channel
- jcp.load_block = jcp.ic_block; // dst simd_w
-
- jcp.bcast_dim = jcp.os; // src H*W
- }
-
- /* # of consecutive channel elements */
- jcp.reduce_loop_unroll = jcp.reduce_block;
-
- /* Offset to move to the next 16 input channel elements with the same H*W position */
- jcp.reduce_loop_bcast_step = jcp.reduce_loop_unroll
- * (is_data_layout_nxc ? 1 : jcp.bcast_dim) * jcp.typesize_in;
-
- /* Offset: 16o*16i (filter) */
- jcp.reduce_loop_load_step
- = jcp.reduce_loop_unroll * jcp.load_block * jcp.typesize_in;
-
- /* Offset: I/16 * 16o */
- jcp.load_loop_load_step
- = (utils::rnd_up(jcp.reduce_dim, jcp.reduce_block))
- * jcp.load_block * jcp.typesize_in;
-
- /* adjusting register blocking */
- int max_regs, min_regs, size_threshold;
-
- /* spatial : H*D of dst */
- const int spatial
- = (one_of(jcp.prop_kind, forward_training, forward_inference))
- ?
jcp.od * jcp.oh // forward - : jcp.id * jcp.ih; // backward - - if ((8 * jcp.mb) / jcp.nthr >= 1 - // NHWC perf: RN50 mb=1 - || (is_data_layout_nxc && jcp.mb == 1)) { - max_regs = 9; // max # of ur_w - min_regs = 6; // min # of ur_w - size_threshold = 14; - jcp.expl_bcast = true; - - /* - * H*D of dst > SMALL_SPATIAL - */ - if (jcp.load_dim > 128 && jcp.load_dim < BIG_LOAD_DIM - && spatial > SMALL_SPATIAL && spatial < BIG_SPATIAL - && jcp.reduce_dim < 256) { - max_regs = 6; - min_regs = 5; - } - } else { - max_regs = 30; - min_regs = 9; - size_threshold = 14; - jcp.expl_bcast = false; - jcp.use_vmovntps = true; - } - jcp.ur = 1; - - for (int ur_w = max_regs; ur_w >= min_regs; ur_w--) { - /* - * H*D of dst >= size_threshold, (H*D of dst) % ur_w == 0 - * or - * H*D of dst < size_threshold, (H*W of dst) % ur_w == 0 - */ - if ((spatial >= size_threshold && spatial % ur_w == 0) - || (spatial < size_threshold && jcp.os % ur_w == 0)) { - jcp.ur = ur_w; - break; - } - } - - if (jcp.ur == 1) { - // If ur = 1, then min(max_regs, H*W of dst) - jcp.ur = nstl::min(max_regs, jcp.os); - int os_tail = jcp.os % max_regs; - for (int i = max_regs; i >= min_regs; i--) { - int i_tail = jcp.os % i; - if (i_tail > os_tail || i_tail == 0) { - jcp.ur = i; - os_tail = i_tail; - if (i_tail == 0) break; - } - } - } - jcp.bcast_block = jcp.ur; // block size of bcast (input data) - /* Number of steps for the dst address to output, used in bcast_loop() */ - jcp.bcast_loop_output_step = jcp.ur * jcp.typesize_out - * (is_data_layout_nxc ? jcp.load_dim : jcp.load_block); - jcp.bcast_loop_output_substep = -1; // unused - - /* Number of steps for the src address to be broadcasted in bcast_loop() */ - jcp.bcast_loop_bcast_step = jcp.ur * jcp.typesize_in - * (is_data_layout_nxc ? jcp.reduce_dim : jcp.reduce_block); - jcp.bcast_loop_bcast_substep = -1; // unused - - jcp.load_loop_iter_step = jcp.load_block; - - if (jcp.prop_kind == backward_data) - jcp.loop_order = loop_lbr; - else - jcp.loop_order = reduce_src ? loop_blr : loop_lbr; - - int nb_bcast = div_up(jcp.bcast_dim, jcp.bcast_block); - int nb_reduce = div_up(jcp.reduce_dim, jcp.reduce_block); - int nb_load = div_up(jcp.load_dim, jcp.load_block); - if (is_data_layout_nxc) { - reduce_blocking = jcp.reduce_dim; - } else if (jcp.expl_bcast) { - if (jcp.load_dim <= BIG_LOAD_DIM && spatial > SMALL_SPATIAL - && spatial < BIG_SPATIAL) { - reduce_blocking = nstl::min(jcp.reduce_dim, 80); - } else if (spatial > SMALL_SPATIAL) - reduce_blocking = nstl::min(jcp.reduce_dim, 512); - else - reduce_blocking = nstl::min(jcp.reduce_dim, 256); - } else { - reduce_blocking = nb_reduce; - if (spatial <= SMALL_SPATIAL && jcp.reduce_dim >= BIG_REDUCE_DIM) - reduce_blocking = 16; - else if (spatial > SMALL_SPATIAL - && jcp.reduce_dim >= BIG_REDUCE_DIM) - reduce_blocking = 8; - reduce_blocking = best_divider(nb_reduce, 1, reduce_blocking, true); - reduce_blocking *= jcp.reduce_block; - } - - // Check input data cache aliasing. - // For other ISA constants may be updated. - // 64 * 1024 is chosen due to 1MB L2 16-way cache. - // 7 is empirical value. It is about half of 16. 
- // So we leave about half of the set for other data - weights, dst - int way_size = (16 * 1024) / jcp.typesize_in; - int max_hits = 7; - if (!is_data_layout_nxc - && jcp.bcast_dim * reduce_blocking > way_size * max_hits) { - int nrb = reduce_blocking / simd_w; - int sp = jcp.bcast_dim; - int wl = way_size / simd_w; - for (int start_off = 0; start_off < jcp.ur; start_off++) { - for (int off = start_off, hits = 0; off < sp * nrb; off += wl) { - if (off % sp >= jcp.ur || ++hits < max_hits) continue; - int max_r_blocking = simd_w * nstl::max(1, (off + wl) / sp); - reduce_blocking - = nstl::min(reduce_blocking, max_r_blocking); - break; - } - } - } - - if (reduce_blocking < jcp.reduce_dim) { - if (jcp.prop_kind == backward_data) - jcp.loop_order = reduce_src ? loop_lbr : loop_rlb; - else - jcp.loop_order = reduce_src ? loop_rbl : loop_rlb; - } - load_blocking = jcp.load_dim; - - /* Number of weight elements to be loaded for dest */ - int load_size = jcp.load_dim * jcp.reduce_dim; - /* Number of elements to be broadcasted from src */ - auto bcast_size - = (dim_t)jcp.mb * jcp.ngroups * jcp.bcast_dim * jcp.reduce_dim; - - /* 12 cores per CMG */ - if (jcp.nthr <= 12 && jcp.mb < jcp.nthr - && nb_load * nb_bcast > jcp.nthr) { - // Some heuristic here - float calc_koef = 0.01, best_cost = FLT_MAX; - int n_lgc = jcp.nthr; - float ratio = (float)load_size / (float)bcast_size; - int best_lgc = ratio > 1 ? n_lgc : 1; - auto calc_job_cost = [&](int lb, int tg, float mem_k) { - int bb_size = jcp.mb * div_up(nb_bcast, tg); - float calc_size = (float)(bb_size * jcp.ur) - * (lb * jcp.load_block) * jcp.reduce_dim; - float mem_size = (float)(bb_size * jcp.ur + lb * jcp.load_block) - * jcp.reduce_dim; - return calc_koef * calc_size + mem_k * mem_size; - }; - for (int lgc, ilgc = 0; ilgc < n_lgc; ilgc++) { - lgc = ratio > 1 ? n_lgc - ilgc : ilgc + 1; - int min_lb = nb_load / lgc; - int max_lb = div_up(nb_load, lgc); - int min_tg = jcp.nthr / lgc; - int max_tg = div_up(jcp.nthr, lgc); - // Some heuristic here - float mem_koef = (max_tg == 1) ? 
1.f : 1.3f;
- float job_cost = 0.;
- if (jcp.nthr % lgc < nb_load % lgc) {
- job_cost = calc_job_cost(max_lb, min_tg, mem_koef);
- } else {
- auto job_cost1 = calc_job_cost(max_lb, max_tg, mem_koef);
- auto job_cost2 = calc_job_cost(min_lb, min_tg, mem_koef);
- job_cost = nstl::max(job_cost1, job_cost2);
- }
-
- if (job_cost < best_cost) {
- best_lgc = lgc;
- best_cost = job_cost;
- }
- }
- jcp.load_grp_count = best_lgc;
- load_blocking
- = div_up(nb_load, jcp.load_grp_count) * jcp.load_block;
- } else {
- jcp.load_grp_count
- = div_up(jcp.nthr, jcp.mb * jcp.ngroups * nb_bcast);
- jcp.load_grp_count = best_divider(jcp.nthr, jcp.load_grp_count,
- 2 * jcp.load_grp_count, false);
- }
- if (jcp.expl_bcast && jcp.bcast_dim <= 64 && load_size >= L2_size) {
- jcp.load_grp_count = nstl::max(jcp.load_grp_count, 4);
- } else if (jcp.bcast_dim <= 49 && jcp.mb <= jcp.nthr
- && jcp.load_dim > 512 && jcp.load_dim / jcp.reduce_dim >= 4) {
- jcp.load_grp_count = nstl::max(jcp.load_grp_count, 2);
- load_blocking = jcp.load_block;
- }
-
- auto get_thr_eff = [=](int load_chunk, int nthr) {
- int lgc = div_up(nb_load, load_chunk);
- int thr_per_grp = div_up(nthr, lgc);
- int bcast_per_thr
- = div_up(jcp.mb * nb_bcast, thr_per_grp) * jcp.bcast_block;
- int load_per_thr = load_chunk * simd_w;
- float data_norm = (bcast_per_thr + load_per_thr) / 2.f;
- float data_eff
- = (bcast_per_thr * load_per_thr) / (data_norm * data_norm);
- float thr_eff_over_grp
- = (float)nstl::max(1, nthr / lgc) / div_up(nthr, lgc);
- float thr_eff_in_grp = ((float)jcp.mb * nb_bcast)
- / rnd_up(jcp.mb * nb_bcast, thr_per_grp);
- float thr_eff = thr_eff_over_grp * thr_eff_in_grp;
- float load_eff = (float)nb_load / rnd_up(nb_load, lgc);
- float overall_eff = data_eff + thr_eff + load_eff;
- return overall_eff;
- };
-
- auto get_load_chunk = [=](int nthr) {
- float best_eff = -1.0f;
- int best_lgc = 1;
- float eff;
-
- for (int load_chunk = 1; load_chunk <= nb_load; load_chunk++) {
- int lgc = div_up(nb_load, load_chunk);
- if (lgc > nthr) continue;
- eff = get_thr_eff(load_chunk, nthr);
- if (eff > best_eff) {
- best_eff = eff;
- best_lgc = lgc;
- }
- }
- return best_lgc;
- };
-
- /* adjust the thread decomposition
- * to improve the thr_eff for small problem size
- * the threshold 8192 is empirical
- * TODO: Threshold can be increased for init stride > 1*/
- if (sizeof(float) * bcast_size < 8192 && jcp.mb < jcp.nthr
- && nb_load * nb_bcast < jcp.nthr) {
- float best_thr_eff = -1.0f;
- float thr_eff = -1.0f;
- int overall_lgc = jcp.load_grp_count;
- int lgc = 1;
- int best_nthr = jcp.nthr;
- int end_nthr = with_groups ?
jcp.ngroups : 1; - for (int nthr = jcp.nthr / 2; nthr >= end_nthr; nthr--) { - lgc = get_load_chunk(nthr); - thr_eff = get_thr_eff(lgc, nthr); - if (best_thr_eff < thr_eff) { - best_thr_eff = thr_eff; - overall_lgc = lgc; - best_nthr = nthr; - } - } - jcp.nthr = best_nthr; - jcp.load_grp_count = overall_lgc; - load_blocking - = div_up(nb_load, jcp.load_grp_count) * jcp.load_block; - } - - bcast_blocking = div_up(jcp.mb * jcp.ngroups * nb_bcast, - div_up(jcp.nthr, jcp.load_grp_count)) - * jcp.bcast_block; - bcast_blocking = nstl::min(jcp.bcast_dim, bcast_blocking); - bcast_blocking = rnd_up(bcast_blocking, jcp.bcast_block); - - int space_for_bcast = (L2_capacity - /* kernel_size - */ - 2 * jcp.load_block * reduce_blocking - jcp.ur * reduce_blocking - - 3 * 1024); - if (jcp.reduce_dim * jcp.bcast_dim > L2_capacity) space_for_bcast /= 2; - - int bcast_in_cache - = nstl::max(jcp.bcast_block, space_for_bcast / reduce_blocking); - bcast_blocking = nstl::min( - bcast_blocking, rnd_dn(bcast_in_cache, jcp.bcast_block)); - // NHWC perf - if (is_data_layout_nxc) bcast_blocking = jcp.bcast_block; - - load_blocking_max = load_blocking; - bcast_blocking_max = bcast_blocking * 3 / 2; - reduce_blocking_max = reduce_blocking; - - jcp.ur_tail = (jcp.with_dw_conv ? jcp.ow : jcp.bcast_dim) % jcp.ur; - - } else if (jcp.prop_kind == backward_weights) { /* BWD weight */ - - jcp.reduce_dim = jcp.is; - - jcp.reduce_block = best_divider(jcp.reduce_dim, 7, 16, true); - if (jcp.reduce_dim % jcp.reduce_block != 0) - jcp.reduce_block = best_divider(jcp.iw, 4, jcp.iw, false); - if (jcp.reduce_block > 256) { jcp.reduce_block = 1; } - - jcp.load_dim = jcp.oc; - jcp.load_block = jcp.oc_block; - - jcp.bcast_dim = jcp.ic; - jcp.bcast_block = jcp.ic_block; - - if (jcp.reduce_block <= 19 && - // maskrcnn optimization for nxc; don't reduce ur when ocb<=1 - !(is_data_layout_nxc && jcp.load_dim <= jcp.load_block)) { - // if reduce_block is big then generated JIT code may be big - // for small values of ur because reduce_loop_unroll = reduce_block - jcp.ur = jcp.bcast_block / 2; - jcp.expl_bcast = true; - } else { - jcp.ur = jcp.bcast_block; - jcp.expl_bcast = false; - } - - jcp.ur_tail = jcp.bcast_dim % jcp.bcast_block; - jcp.reduce_loop_unroll = jcp.reduce_block; - jcp.reduce_loop_bcast_step = jcp.typesize_in * jcp.reduce_loop_unroll - * (is_data_layout_nxc ? jcp.ic : jcp.ic_block); - jcp.reduce_loop_load_step = jcp.typesize_in * jcp.reduce_loop_unroll - * (is_data_layout_nxc ? jcp.oc : jcp.oc_block); - - jcp.bcast_loop_output_step - = jcp.oc_block * jcp.ic_block * jcp.typesize_out; - jcp.bcast_loop_output_substep - = jcp.oc_block * jcp.ur * jcp.typesize_out; - jcp.bcast_loop_bcast_step = jcp.ic_block - * (is_data_layout_nxc ? 1 - : utils::rnd_up( - jcp.reduce_dim, jcp.reduce_block)) - * jcp.typesize_in; - jcp.bcast_loop_bcast_substep = jcp.ur * jcp.typesize_in; - - jcp.load_loop_load_step = jcp.typesize_in * jcp.oc_block - * (is_data_layout_nxc ? 
1 : jcp.os);
- jcp.load_loop_iter_step = jcp.oc_block;
-
- /* --- */
- balance(jcp);
-
- load_blocking = div_up(jcp.load_dim, jcp.load_block);
- load_blocking = best_divider(load_blocking, 16, load_blocking, false);
- load_blocking *= jcp.load_block;
-
- load_blocking_max = load_blocking;
- assert(IMPLICATION(
- !is_data_layout_nxc, jcp.load_dim % load_blocking == 0));
-
- int max_bcast_blocking = div_up(jcp.bcast_dim, jcp.bcast_block);
- int min_bcast_blocking = 5;
-
- bcast_blocking = div_up(jcp.bcast_dim, jcp.bcast_block);
- bcast_blocking = best_divider(
- bcast_blocking, min_bcast_blocking, max_bcast_blocking, false);
- bcast_blocking *= jcp.bcast_block;
- bcast_blocking_max = bcast_blocking;
- assert(IMPLICATION(
- !is_data_layout_nxc, jcp.bcast_dim % bcast_blocking == 0));
-
- // for reduction balance
- if (is_data_layout_nxc && jcp.reduce_dim >= BIG_SPATIAL * BIG_SPATIAL
- && jcp.load_dim >= BIG_LOAD_DIM / 2) {
- reduce_blocking = rnd_up(nstl::min(jcp.ow, 256), jcp.reduce_block);
- } else {
- int max_reduce_blocking
- = nstl::min(L1_capacity / jcp.ur, jcp.reduce_dim);
- int min_reduce_blocking = nstl::min(
- L1_capacity / jcp.ur, nstl::max(jcp.iw, jcp.ih));
- reduce_blocking = best_divider(jcp.reduce_dim, min_reduce_blocking,
- max_reduce_blocking, true);
- reduce_blocking
- = nstl::max(rnd_dn(reduce_blocking, jcp.reduce_block),
- jcp.reduce_block);
- }
-
- reduce_blocking_max = rnd_dn(reduce_blocking * 3 / 2, jcp.reduce_block);
- } else
- return status::unimplemented;
-
- assert(load_blocking);
- assert(load_blocking_max);
- assert(bcast_blocking);
- assert(bcast_blocking_max);
- assert(reduce_blocking);
- assert(reduce_blocking_max);
-
- if (!is_data_layout_nxc) {
- assert(load_blocking % jcp.load_block == 0);
- assert(reduce_blocking % jcp.reduce_block == 0);
- assert(load_blocking_max % jcp.load_block == 0);
- assert(reduce_blocking_max % jcp.reduce_block == 0);
- assert(jcp.reduce_dim % jcp.reduce_block == 0);
- }
-
- assert(jcp.bcast_block % jcp.ur == 0);
-
- jcp.nb_bcast_blocking = bcast_blocking / jcp.bcast_block;
- jcp.nb_bcast_blocking_max = bcast_blocking_max / jcp.bcast_block;
- jcp.nb_load_blocking = utils::div_up(load_blocking, jcp.load_block);
- jcp.nb_load_blocking_max = utils::div_up(load_blocking_max, jcp.load_block);
- jcp.nb_reduce_blocking = utils::div_up(reduce_blocking, jcp.reduce_block);
- jcp.nb_reduce_blocking_max
- = utils::div_up(reduce_blocking_max, jcp.reduce_block);
-
- jcp.nb_bcast = div_up(jcp.bcast_dim, jcp.bcast_block);
- jcp.nb_load = div_up(jcp.load_dim, jcp.load_block);
- jcp.nb_reduce = div_up(jcp.reduce_dim, jcp.reduce_block);
-
- return status::success;
-}
-
-void jit_sve_512_1x1_conv_kernel::init_scratchpad(
- memory_tracking::registrar_t &scratchpad,
- const jit_1x1_conv_conf_t &jcp) {
-
- using namespace dnnl::impl::memory_tracking::names;
-
- // For nxc layout bias is padded only for bwd_wb direction, as bias
- reduction kernels can't handle tails yet.
- if (jcp.with_bias && jcp.prop_kind != backward_data - && (jcp.oc != jcp.oc_without_padding // blocked layout - || (jcp.prop_kind == backward_weights // nxc layout - && jcp.oc % jcp.oc_block != 0))) { - - const size_t nelems_padded_bias - = jcp.ngroups * utils::rnd_up(jcp.oc, jcp.oc_block); - scratchpad.book( - key_conv_padded_bias, nelems_padded_bias, jcp.typesize_out); - } - - if (jcp.prop_kind == backward_weights) { - const size_t wei_size = (size_t)jcp.ngroups - * rnd_up(jcp.oc, jcp.oc_block) * rnd_up(jcp.ic, jcp.ic_block); - scratchpad.book(key_conv_wei_reduction, wei_size * (jcp.nthr_mb - 1), - jcp.typesize_out); - } -} - -/* BWD W*/ -void jit_sve_512_1x1_conv_kernel::balance(jit_1x1_conv_conf_t &jcp) { - int nthreads = jcp.nthr; - // initialize jcp reduction threading properties - jcp.nthr = jcp.nthr_mb = jcp.nthr_g = jcp.nthr_oc_b = jcp.nthr_ic_b = 1; - if (nthreads < jcp.ngroups) { - /* simplification... fortunately it doesn't hurt much */ - return; - } - const int nb_bcast = div_up(jcp.bcast_dim, jcp.bcast_block); - const int nb_load = div_up(jcp.load_dim, jcp.load_block); - const int nb_reduce = div_up(jcp.reduce_dim, jcp.reduce_block); - - jcp.nthr_g = jcp.ngroups; - const int nthr = nthreads / jcp.nthr_g; - - auto calc_mem_cost = [=](int nthr_mb, int nthr_oc_b, int nthr_ic_b) { - /* calculate per thread memory cost (read/write). high level - * optimizer tries to minimize memory consumption. few notes: (n1) - * unclear why, but that essentially helps first convolution... - * (n2) assuming the reduction over minibatch is always there: - * - instead of 8 it should be 5 here (write ~= 2 read): - * kernel: temporal workspace 1 write - * reduction: 1 read from workspace and 1 write to the diff_wei - * - but experiments showed 8 works better than 5 or 6... 
*/ - int bcast_koeff = 1; - int load_koeff = 1; - int output_koeff = 12; - return 0 - + (size_t)bcast_koeff * div_up(jcp.mb * nb_reduce, nthr_mb) - * div_up(jcp.ngroups, jcp.nthr_g) * div_up(nb_bcast, nthr_ic_b) - * jcp.ic_block * jcp.reduce_block / jcp.stride_h - / jcp.stride_w /* (n1) */ - + (size_t)load_koeff * div_up(jcp.mb * nb_reduce, nthr_mb) - * div_up(jcp.ngroups, jcp.nthr_g) * div_up(nb_load, nthr_oc_b) - * jcp.oc_block * jcp.reduce_block - + (size_t)output_koeff /* (n2) */ - * div_up(jcp.ngroups, jcp.nthr_g) * div_up(nb_load, nthr_oc_b) - * div_up(nb_bcast, nthr_ic_b) * jcp.ic_block * jcp.oc_block; - }; - - int nthr_mb = 1, nthr_oc_b = 1, nthr_ic_b = 1; - auto best_mem_cost = calc_mem_cost(nthr_mb, nthr_oc_b, nthr_ic_b); - - /* step 1: find the best thread distribution with lowest memory cost */ - const int nthr_mb_max = nstl::min(nthr, jcp.mb * nb_reduce); - for (nthr_mb = 1; nthr_mb <= nthr_mb_max; ++nthr_mb) { - const int nthr_par = nthr / nthr_mb; - const int nthr_oc_b_max = nstl::min(nthr_par, nb_load); - for (nthr_oc_b = 1; nthr_oc_b <= nthr_oc_b_max; ++nthr_oc_b) { - nthr_ic_b = nstl::min(nthr_par / nthr_oc_b, nb_bcast); - auto mem_cost = calc_mem_cost(nthr_mb, nthr_oc_b, nthr_ic_b); - if (mem_cost <= best_mem_cost) { - best_mem_cost = mem_cost; - jcp.nthr_mb = nthr_mb; - jcp.nthr_oc_b = nthr_oc_b; - jcp.nthr_ic_b = nthr_ic_b; - } - } - } - if (jcp.nthr_mb > nthreads / 2 && jcp.nthr_mb < nthreads) - jcp.nthr_mb = nstl::min(jcp.mb, nthreads); - - jcp.nthr = jcp.nthr_mb * jcp.nthr_g * jcp.nthr_oc_b * jcp.nthr_ic_b; - assert(jcp.nthr <= nthreads); -} - -} // namespace aarch64 -} // namespace cpu -} // namespace impl -} // namespace dnnl diff --git a/src/cpu/aarch64/jit_sve_512_1x1_conv_kernel.hpp b/src/cpu/aarch64/jit_sve_512_1x1_conv_kernel.hpp deleted file mode 100644 index 2d41be54911..00000000000 --- a/src/cpu/aarch64/jit_sve_512_1x1_conv_kernel.hpp +++ /dev/null @@ -1,204 +0,0 @@ -/******************************************************************************* -* Copyright 2021-2023 Intel Corporation -* Copyright 2021-2023 FUJITSU LIMITED -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
-*******************************************************************************/
-
-#ifndef CPU_AARCH64_JIT_SVE_1x1_CONV_KERNEL_HPP
-#define CPU_AARCH64_JIT_SVE_1x1_CONV_KERNEL_HPP
-
-#include "common/c_types_map.hpp"
-#include "common/memory_tracking.hpp"
-
-#include "cpu/aarch64/injectors/jit_uni_postops_injector.hpp"
-#include "cpu/aarch64/jit_generator.hpp"
-#include "cpu/aarch64/jit_op_imm_check.hpp"
-#include "cpu/aarch64/jit_primitive_conf.hpp"
-
-using namespace Xbyak_aarch64;
-
-namespace dnnl {
-namespace impl {
-namespace cpu {
-namespace aarch64 {
-
-/* Get vector offsets, ofs / VL(VL: 512bits = 64Bytes) */
-#define VL64_OFS(ofs) ((ofs) >> 6)
-
-struct jit_sve_512_1x1_conv_kernel : public jit_generator {
- jit_sve_512_1x1_conv_kernel(const jit_1x1_conv_conf_t &ajcp,
- const primitive_attr_t &attr, const memory_desc_t &dst_md);
-
- DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_sve_512_1x1_conv_kernel)
-
- static status_t init_conf(jit_1x1_conv_conf_t &jcp,
- const convolution_desc_t &cd, const memory_desc_wrapper &src_d,
- const memory_desc_wrapper &weights_d,
- const memory_desc_wrapper &dst_d, const primitive_attr_t &attr,
- int nthreads, bool reduce_src);
-
- static void init_scratchpad(memory_tracking::registrar_t &scratchpad,
- const jit_1x1_conv_conf_t &jcp);
-
- jit_1x1_conv_conf_t jcp;
- const primitive_attr_t &attr_;
-
-private:
- using reg64_t = const XReg;
-
- /* Flags and loop variables */
- reg64_t reg_reduce_pos_flag = x1;
- reg64_t reduce_loop_iter = x2;
- reg64_t reg_bcast_loop_iter = x3;
- reg64_t reg_relu_ns = x20; // For forward
- reg64_t reg_output_stride = x20; // For backward
-
- /* Pointer */
- reg64_t reg_bcast_data = x5; // Input
- reg64_t reg_load_data = x6; // Weight
- reg64_t reg_output_data = x7; // Output
- reg64_t reg_bias_data = x8; // bias
- reg64_t aux1_reg_bcast_data = x9;
- reg64_t aux_reg_output_data = x10;
- reg64_t aux_reg_bcast_data = x11;
- reg64_t aux_reg_load_data = x12;
- reg64_t reg_prev_bcast_addr
- = x13; // Input: The reg keeps addr accessed by previous ldr inst
- reg64_t reg_prev_out_addr
- = x14; // Output: The reg keeps addr accessed by previous ldr or str inst
-
- /* Workload */
- reg64_t reg_load_loop_work = x15;
- reg64_t reg_reduce_loop_work = x16;
- reg64_t reg_bcast_loop_work = x17;
-
- /* Temporary registers */
- reg64_t reg_tmp_imm = x27; // tmp for add_imm
- reg64_t reg_tmp_ofs = x19; // tmp reg to calc bwd wei offset in out_load
-
- reg64_t reg_load_dim_tail_mask = aux_reg_load_data;
-
- std::unique_ptr>
- postops_injector_;
-
- constexpr static int isa_simd_width_
- = cpu_isa_traits::vlen / sizeof(float);
-
- ZReg vreg_bcast = ZReg(31);
- PReg k_load_dim_mask = p2;
- PReg k_load_dim_tail_mask = p3;
- ZReg zreg_tmp = ZReg(31);
- ZReg zreg_tmp1 = ZReg(30);
-
- constexpr static int reg64_size_ = sizeof(int64_t);
- constexpr static int reg_bcast_loop_work_offt = 0;
- constexpr static int reg_binary_post_op_acc_off = 1 * reg64_size_;
- constexpr static int reg_abi_param1_backup = 2 * reg64_size_;
- constexpr static int stack_space_needed = 3 * reg64_size_;
-
- template
- Xbyak_aarch64::XReg EVEX_compress_addr(const Xbyak_aarch64::XReg &addr,
- const Xbyak_aarch64::XReg &x_tmp, Xbyak_aarch64::XReg base,
- T raw_offt, bool bcast = false) {
-
- assert(raw_offt <= INT_MAX);
- auto offt = static_cast(raw_offt);
-
- add_imm(addr, base, offt, x_tmp);
- if (bcast) {
- // addr is the same as when bcast is false.
- } - return addr; - } - - void prefetch( - const std::string prfop, int level, reg64_t in, long long int ofs) { - bool for_load = false; - if (prfop == "LD") { - for_load = true; - } else if (prfop == "ST") { - for_load = false; - } else { - assert(!"invalid prfop"); - } - - bool cacheline_aligned = ((ofs & 0xFF) == 0) ? true : false; - if (cacheline_aligned == true) { - Prfop op; - switch (level) { - case 1: op = (for_load == true) ? PLDL1KEEP : PSTL1KEEP; break; - case 2: op = (for_load == true) ? PLDL2KEEP : PSTL2KEEP; break; - case 3: op = (for_load == true) ? PLDL3KEEP : PSTL3KEEP; break; - default: assert(!"invalid prfop"); break; - } - - if (prfm_imm_check(ofs)) { - prfm(op, ptr(in, static_cast(ofs))); - } else { - add_imm(reg_tmp_ofs, in, ofs, reg_tmp_imm); - prfm(op, ptr(reg_tmp_ofs)); - } - } else { - PrfopSve op_sve; - switch (level) { - case 1: - op_sve = (for_load == true) ? PLDL1KEEP_SVE : PSTL1KEEP_SVE; - break; - case 2: - op_sve = (for_load == true) ? PLDL2KEEP_SVE : PSTL2KEEP_SVE; - break; - case 3: - op_sve = (for_load == true) ? PLDL3KEEP_SVE : PSTL3KEEP_SVE; - break; - default: assert(!"invalid prfop"); break; - } - - if (prfw_imm_check(ofs)) { - prfw(op_sve, P_ALL_ONE, - ptr(in, static_cast(VL64_OFS(ofs)))); - } else { - add_imm(reg_tmp_ofs, in, ofs, reg_tmp_imm); - prfw(op_sve, P_ALL_ONE, ptr(reg_tmp_ofs)); - } - } - } - - void bcast_loop(int load_loop_blk); - void reduce_loop(int load_loop_blk, int ur, int substep, bool wraparound); - - void generate() override; - static void balance(jit_1x1_conv_conf_t &jcp); - - inline size_t get_output_offset( - const bool is_out_layout_nxc, const int i_load, const int i_ur) { - const size_t i_load_shift = is_out_layout_nxc - ? jcp.load_block - : (jcp.with_dw_conv ? jcp.ow : jcp.bcast_dim) * jcp.load_block; - const size_t i_ur_shift - = is_out_layout_nxc ? jcp.load_dim : jcp.load_block; - return jcp.typesize_out * (i_load * i_load_shift + i_ur * i_ur_shift); - } - - Xbyak_aarch64::XReg output_ptr(const bool out_layout_nxc, const int i_load, - const int i_ur, Xbyak_aarch64::XReg addr); - void apply_postops(const bool is_out_layout_nxc, const int load_loop_blk, - const int ur); -}; - -} // namespace aarch64 -} // namespace cpu -} // namespace impl -} // namespace dnnl - -#endif diff --git a/src/cpu/aarch64/jit_sve_512_1x1_convolution.cpp b/src/cpu/aarch64/jit_sve_512_1x1_convolution.cpp deleted file mode 100644 index 4d311bdb611..00000000000 --- a/src/cpu/aarch64/jit_sve_512_1x1_convolution.cpp +++ /dev/null @@ -1,1040 +0,0 @@ -/******************************************************************************* -* Copyright 2021-2023 Intel Corporation -* Copyright 2021-2023 FUJITSU LIMITED -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
-*******************************************************************************/ - -#include "common/c_types_map.hpp" -#include "common/dnnl_thread.hpp" -#include "common/type_helpers.hpp" -#include "common/utils.hpp" - -#include "cpu/aarch64/jit_generator.hpp" - -#include "cpu/aarch64/jit_sve_512_1x1_convolution.hpp" - -namespace dnnl { -namespace impl { -namespace cpu { -namespace aarch64 { - -using namespace dnnl::impl::status; -using namespace dnnl::impl::memory_tracking::names; -using namespace dnnl::impl::utils; - -#define data_blk_off(f, n, c, d, h, w) \ - ((ndims == 3) ? (f).blk_off(n, c, w) \ - : ((ndims == 4) ? (f).blk_off(n, c, h, w) \ - : (f).blk_off(n, c, d, h, w))) -/* convolution forward */ - -template -void jit_sve_512_1x1_convolution_fwd_t::execute_forward(const exec_ctx_t &ctx) const { - const auto &jcp = kernel_->jcp; - auto src = CTX_IN_MEM(const src_data_t *, DNNL_ARG_SRC); - auto weights = CTX_IN_MEM(const wei_data_t *, DNNL_ARG_WEIGHTS); - auto bias = CTX_IN_MEM(const dst_data_t *, DNNL_ARG_BIAS); - auto dst = CTX_OUT_MEM(dst_data_t *, DNNL_ARG_DST); - auto weights_dw = CTX_IN_MEM( - const wei_data_t *, DNNL_ARG_ATTR_POST_OP_DW | DNNL_ARG_WEIGHTS); - auto bias_dw = CTX_IN_MEM( - const dst_data_t *, DNNL_ARG_ATTR_POST_OP_DW | DNNL_ARG_BIAS); - const auto post_ops_binary_rhs_arg_vec - = binary_injector::prepare_binary_args(pd()->jcp_.post_ops, ctx); - const auto post_ops_binary_rhs_arg_vec_dw = pd()->dw_conv_pd_ - ? binary_injector::prepare_binary_args( - pd()->dw_conv_pd_->jcp_.post_ops, ctx, - pd()->jcp_.post_ops.entry_.size() + 1) - : std::vector {}; - - auto scratchpad = ctx.get_scratchpad_grantor(); - - if (pd()->wants_padded_bias()) { - auto padded_bias - = scratchpad.template get(key_conv_padded_bias); - utils::array_copy(padded_bias, bias, jcp.oc_without_padding); - utils::array_set(padded_bias + jcp.oc_without_padding, 0.f, - jcp.oc - jcp.oc_without_padding); - bias = padded_bias; - } - - parallel(jcp.nthr, [&](const int ithr, const int nthr) { - execute_forward_thr(ithr, nthr, src, weights, bias, weights_dw, bias_dw, - dst, scratchpad, post_ops_binary_rhs_arg_vec.data(), - post_ops_binary_rhs_arg_vec_dw.data()); - }); - - if (pd()->wants_zero_pad_dst()) ctx.zero_pad_output(DNNL_ARG_DST); -} - -template -void jit_sve_512_1x1_convolution_fwd_t::execute_forward_thr(const int ithr, const int nthr, - const src_data_t *src, const wei_data_t *weights, - const dst_data_t *bias, const wei_data_t *weights_dw, - const dst_data_t *bias_dw, dst_data_t *dst, - const memory_tracking::grantor_t &scratchpad, - const void *post_ops_binary_rhs_arg_vec, - const void *post_ops_binary_rhs_arg_vec_dw) const { - const memory_desc_wrapper src_d(pd()->src_md()); - const memory_desc_wrapper dst_d(pd()->dst_md()); - const memory_desc_wrapper weights_d(pd()->weights_md(0)); - const memory_desc_wrapper dw_weights_d( - pd()->arg_md(DNNL_ARG_ATTR_POST_OP_DW | DNNL_ARG_WEIGHTS)); - const memory_desc_wrapper dw_bias_d( - pd()->arg_md(DNNL_ARG_ATTR_POST_OP_DW | DNNL_ARG_BIAS)); - - const auto &jcp = kernel_->jcp; - auto rtus_space = pd()->rtus_.reduce_src_ - ? scratchpad.get(key_conv_rtus_space) - : nullptr; - - const int ndims = src_d.ndims(); - const int stride_d = (ndims == 5) ? pd()->desc()->strides[0] : 1; - const int stride_h = (ndims == 3) ? 1 : pd()->desc()->strides[ndims - 4]; - const int stride_w = pd()->desc()->strides[ndims - 3]; - - auto step = [](int default_step, int remaining, int tail_step) { - assert(default_step <= tail_step); - return remaining < tail_step ? 
remaining : default_step; - }; - - auto p = jit_1x1_conv_call_s(); - - auto rp = rtus_driver_t::call_params_t(); - const int nb_oc = jcp.nb_load; - const int nb_ic = jcp.nb_reduce; - const int nb_ic_blocking = jcp.nb_reduce_blocking; - - // override some constants for fused dw_conv - const int os_block = jcp.with_dw_conv ? jcp.ow : jcp.bcast_block; - const int nb_bcast = jcp.with_dw_conv ? jcp.oh : jcp.nb_bcast; - const int nb_bcast_blocking = jcp.with_dw_conv ? 1 : jcp.nb_bcast_blocking; - const int nb_bcast_blocking_max - = jcp.with_dw_conv ? 1 : jcp.nb_bcast_blocking_max; - const int nb_load_blocking = jcp.nb_load_blocking; - const int nb_load_blocking_max = jcp.with_dw_conv - ? jcp.nb_load_blocking - : jcp.nb_load_blocking_max; - const bool is_dst_layout_nxc = utils::one_of( - jcp.dst_tag, format_tag::nwc, format_tag::nhwc, format_tag::ndhwc); - const bool is_src_layout_nxc = utils::one_of( - jcp.src_tag, format_tag::nwc, format_tag::nhwc, format_tag::ndhwc); - - // Begin: declare Variables needed for dw conv. - memory_tracking::grantor_t dw_scratchpad( - scratchpad, memory_tracking::names::prefix_fusion); - dst_data_t *pbuf; - size_t row_offset; - const int nb_buffer = jcp.nb_load_blocking; - std::vector addrs; - // End - - auto init_bcast = [&](int iwork, int bcast_end, int &n, int &g, - int &bcast_step, int &od, int &oh, int &ow, - int &id, int &ih, int &iw) { - int osb {0}; - nd_iterator_init(iwork, n, jcp.mb, g, jcp.ngroups, osb, nb_bcast); - bcast_step = step( - nb_bcast_blocking, nb_bcast - osb, nb_bcast_blocking_max); - bcast_step = nstl::min(bcast_step, bcast_end - iwork); - - const int os = osb * os_block; - od = os / (jcp.oh * jcp.ow); - int os_2d = os % (jcp.oh * jcp.ow); - oh = os_2d / jcp.ow; - ow = os_2d % jcp.ow; - - id = od * stride_d; - ih = oh * stride_h; - iw = ow * stride_w; - rp.iw_start = iw; - - p.bcast_dim = this_block_size(os, jcp.os, bcast_step * os_block); - rp.os = p.bcast_dim; - }; - - auto init_load = [&](int ocb, int ocb_end, int &load_step) { - load_step = step(nb_load_blocking, ocb_end - ocb, nb_load_blocking_max); - const auto max_oc - = nstl::min(ocb_end * jcp.oc_block, jcp.oc_without_padding); - p.load_dim = this_block_size( - ocb * jcp.oc_block, max_oc, load_step * jcp.oc_block); - }; - - auto init_reduce = [&](int icb) { - const int nb_ic_blocking_step - = nstl::min(icb + nb_ic_blocking, nb_ic) - icb; - p.first_last_flag = 0 | (icb == 0 ? FLAG_REDUCE_FIRST : 0) - | (icb + nb_ic_blocking_step >= nb_ic ? FLAG_REDUCE_LAST : 0); - - p.reduce_dim = this_block_size( - icb * jcp.ic_block, jcp.ic, nb_ic_blocking_step * jcp.ic_block); - rp.icb = p.reduce_dim; - }; - - auto ker_1x1 = [&](int ocb, int ocb_start, int icb, int n, int g, int od, - int oh, int ow, int id, int ih, int iw) { - const int oc_off_idx = is_dst_layout_nxc - ? g * jcp.oc + ocb * jcp.oc_block - : g * nb_oc + ocb; - const size_t dst_off = data_blk_off(dst_d, n, oc_off_idx, od, oh, ow); - - p.output_data = jcp.with_dw_conv - ? pbuf + (oh % pd()->dw_conv_pd_->jcp_.kh) * row_offset - : &dst[dst_off]; - p.bias_data = bias - ? &bias[oc_off_idx * (is_dst_layout_nxc ? 1 : jcp.oc_block)] - : nullptr; - - p.load_data - = &weights[pd()->with_groups() ? weights_d.blk_off(g, ocb, icb) - : weights_d.blk_off(ocb, icb)]; - const int ic_off_idx = is_src_layout_nxc - ? g * jcp.ic + icb * jcp.ic_block - : g * nb_ic + icb; - if (pd()->rtus_.reduce_src_) { - rp.ws = rtus_space + ithr * pd()->rtus_.space_per_thread_ - + (is_src_layout_nxc ? 
ic_off_idx - : jcp.is * ic_off_idx * jcp.ic_block); - if (ocb == ocb_start) { - rp.src = src + data_blk_off(src_d, n, ic_off_idx, id, ih, iw); - (*rtus_driver_)(&rp); - } - p.bcast_data = rp.ws; - } else - p.bcast_data = src + data_blk_off(src_d, n, ic_off_idx, id, ih, iw); - - p.oc_l_off = oc_off_idx * (is_dst_layout_nxc ? 1 : jcp.oc_block); - p.post_ops_binary_rhs_arg_vec = post_ops_binary_rhs_arg_vec; - p.dst_orig = dst; - - (*kernel_)(&p); - }; - auto conv_1x1 = [&](int bcast_start, int bcast_end, int ocb_start, - int ocb_end) { - if (bcast_start >= bcast_end || ocb_start >= ocb_end) return; - - if (jcp.loop_order == loop_rlb) { - for (int icb = 0; icb < nb_ic; icb += nb_ic_blocking) { - init_reduce(icb); - int ocb = ocb_start; - while (ocb < ocb_end) { - int load_step; - init_load(ocb, ocb_end, load_step); - int iwork = bcast_start; - while (iwork < bcast_end) { - int n {0}, g {0}, bcast_step {0}, od {0}, oh {0}, - ow {0}, id {0}, ih {0}, iw {0}; - init_bcast(iwork, bcast_end, n, g, bcast_step, od, oh, - ow, id, ih, iw); - ker_1x1(ocb, ocb_start, icb, n, g, od, oh, ow, id, ih, - iw); - iwork += bcast_step; - } - ocb += load_step; - } - } - } else if (jcp.loop_order == loop_lbr) { - int ocb = ocb_start; - while (ocb < ocb_end) { - int load_step; - init_load(ocb, ocb_end, load_step); - int iwork = bcast_start; - while (iwork < bcast_end) { - int n {0}, g {0}, bcast_step {0}, od {0}, oh {0}, ow {0}, - id {0}, ih {0}, iw {0}; - init_bcast(iwork, bcast_end, n, g, bcast_step, od, oh, ow, - id, ih, iw); - for (int icb = 0; icb < nb_ic; icb += nb_ic_blocking) { - init_reduce(icb); - ker_1x1(ocb, ocb_start, icb, n, g, od, oh, ow, id, ih, - iw); - } - iwork += bcast_step; - } - ocb += load_step; - } - } else if (jcp.loop_order == loop_rbl) { - for (int icb = 0; icb < nb_ic; icb += nb_ic_blocking) { - init_reduce(icb); - int iwork = bcast_start; - while (iwork < bcast_end) { - int n {0}, g {0}, bcast_step {0}, od {0}, oh {0}, ow {0}, - id {0}, ih {0}, iw {0}; - init_bcast(iwork, bcast_end, n, g, bcast_step, od, oh, ow, - id, ih, iw); - int ocb = ocb_start; - while (ocb < ocb_end) { - int load_step; - init_load(ocb, ocb_end, load_step); - ker_1x1(ocb, ocb_start, icb, n, g, od, oh, ow, id, ih, - iw); - ocb += load_step; - } - iwork += bcast_step; - } - } - } else if (jcp.loop_order == loop_blr) { - int iwork = bcast_start; - while (iwork < bcast_end) { - int n {0}, g {0}, bcast_step {0}, od {0}, oh {0}, ow {0}, - id {0}, ih {0}, iw {0}; - init_bcast(iwork, bcast_end, n, g, bcast_step, od, oh, ow, id, - ih, iw); - int ocb = ocb_start; - while (ocb < ocb_end) { - int load_step; - init_load(ocb, ocb_end, load_step); - for (int icb = 0; icb < nb_ic; icb += nb_ic_blocking) { - init_reduce(icb); - ker_1x1(ocb, ocb_start, icb, n, g, od, oh, ow, id, ih, - iw); - } - ocb += load_step; - } - iwork += bcast_step; - } - } else { - assert(!"unsupported loop order"); - } - }; - - auto ker_dw = [&](int n, int ocb_start, int load_step, int &dw_oh) { - auto &jcp_dw = pd()->dw_conv_pd_->jcp_; - int oh_1x1 = nstl::max(dw_oh * jcp_dw.stride_h - jcp_dw.t_pad, 0); - - for (int i = 0; i < jcp_dw.kh; ++i) - addrs[i] = pbuf + ((oh_1x1++) % jcp_dw.kh) * row_offset; - - const auto ocb_end = ocb_start + load_step; - const auto wch_stride = (is_src_layout_nxc ? 
1 : jcp_dw.iw) - * jcp_dw.nb_ch_blocking * jcp_dw.ch_block; - const int dil_h = jcp_dw.dilate_h + 1; - const int str_h = jcp_dw.stride_h; - const int ch_num = jcp_dw.nb_ch_blocking; - const int ow = 0; - const int kw = 0; - - for (int ch = ocb_start; ch < ocb_end; ch += jcp_dw.nb_ch_blocking) { - - const int i_t_overflow - = nstl::max(0, (int)(jcp_dw.t_pad - dw_oh * str_h)); - const int i_b_overflow - = nstl::max(jcp_dw.ih, - (int)(dw_oh * str_h + (jcp_dw.kh - 1) * dil_h - - jcp_dw.t_pad + 1)) - - jcp_dw.ih; - - const int kh = div_up(i_t_overflow, dil_h); - const int kh_padding = jcp_dw.kh - div_up(i_t_overflow, dil_h) - - div_up(i_b_overflow, dil_h); - - jit_conv_call_s par_conv_dw; - - par_conv_dw.src = addrs.data(); - - const size_t ch_step = is_dst_layout_nxc - ? jcp_dw.ch_block - : dst_d.blk_off(0, 1, 0, 0); - par_conv_dw.dst - = &dst[dst_d.blk_off(n, 0, dw_oh, ow) + ch * ch_step]; - - par_conv_dw.filt - = &weights_dw[dw_weights_d.blk_off(ch, 0, 0, kh, kw)]; - if (bias) - par_conv_dw.bias - = &bias_dw[dw_bias_d.blk_off(ch * jcp_dw.ch_block)]; - - par_conv_dw.kh_padding = (size_t)nstl::max(0, kh_padding); - - par_conv_dw.load_work = (nstl::min(ch + ch_num, jcp_dw.nb_ch) - ch) - * jcp_dw.ch_block; - - par_conv_dw.oc_l_off = ch * jcp_dw.ch_block; - par_conv_dw.post_ops_binary_rhs_arg_vec - = post_ops_binary_rhs_arg_vec_dw; - par_conv_dw.dst_orig = dst; - - (*kernel_dw_)(&par_conv_dw); - - for (int i = 0; i < jcp_dw.kh; ++i) - addrs[i] += wch_stride; - } - }; - - auto conv_dw = [&]() { - // Set variables - auto dw_conv_buffer - = dw_scratchpad.get(key_fusion_inout_buffer); - auto &jcp_dw = pd()->dw_conv_pd_->jcp_; - - const auto dw_conv_buffer_size_ - = (size_t)jcp_dw.kh * jcp.ow * nb_buffer * jcp.oc_block; - pbuf = dw_conv_buffer + ithr * dw_conv_buffer_size_; - row_offset = dw_conv_buffer_size_ / jcp_dw.kh; - addrs.resize(jcp_dw.kh); - - int bcast_start {0}, bcast_end {0}, ocb_start {0}, ocb_end {0}; - balance2D(nthr, ithr, jcp.mb * jcp.ngroups * jcp_dw.oh, bcast_start, - bcast_end, nb_oc, ocb_start, ocb_end, jcp.load_grp_count); - - while (ocb_start < ocb_end) { - int load_step; - init_load(ocb_start, ocb_end, load_step); - - int oh_1x1 = 0; - auto bcast_iter = bcast_start; - while (bcast_iter < bcast_end) { - int n {0}, g {0}, oh_dw {0}; - nd_iterator_init(bcast_iter, n, jcp.mb, g, jcp.ngroups, oh_dw, - jcp_dw.oh); - if (oh_dw == 0) oh_1x1 = 0; // Reset over mb boundary - const int oh_1x1_range = oh_dw * jcp_dw.stride_h - jcp_dw.t_pad; - const int oh_1x1_begin = nstl::max(oh_1x1_range, 0); - const int oh_1x1_end - = nstl::min(oh_1x1_range + jcp_dw.kh, jcp.oh); - oh_1x1 = nstl::max( - oh_1x1_begin, oh_1x1); // Skip rows computed previously - - // dw_spatial to 1x1 spatial conversion. 
if jcp.oh != jcp_dw.oh
- const int bcast_start_1x1
- = n * jcp.ngroups * jcp.oh + g * jcp.oh + oh_1x1;
- const int bcast_end_1x1 = bcast_start_1x1 - oh_1x1 + oh_1x1_end;
-
- conv_1x1(bcast_start_1x1, bcast_end_1x1, ocb_start,
- ocb_start + load_step);
- oh_1x1 = oh_1x1_end;
- ker_dw(n, g * nb_oc + ocb_start, load_step, oh_dw);
-
- bcast_iter += nb_bcast_blocking;
- }
- ocb_start += load_step;
- }
- };
-
- if (jcp.with_dw_conv) {
- conv_dw();
- } else {
-
- const int work_amount = jcp.mb * jcp.ngroups * jcp.nb_bcast;
- int bcast_start {0}, bcast_end {0}, ocb_start {0}, ocb_end {0};
- balance2D(nthr, ithr, work_amount, bcast_start, bcast_end, jcp.nb_load,
- ocb_start, ocb_end, jcp.load_grp_count);
-
- conv_1x1(bcast_start, bcast_end, ocb_start, ocb_end);
- }
-}
-
-template struct jit_sve_512_1x1_convolution_fwd_t;
-
-/* convolution backward wrt data */
-template
-void jit_sve_512_1x1_convolution_bwd_data_t::execute_backward_data(const exec_ctx_t &ctx) const {
- auto diff_dst = CTX_IN_MEM(const diff_dst_data_t *, DNNL_ARG_DIFF_DST);
- auto weights = CTX_IN_MEM(const wei_data_t *, DNNL_ARG_WEIGHTS);
- auto diff_src = CTX_OUT_MEM(diff_src_data_t *, DNNL_ARG_DIFF_SRC);
-
- const memory_desc_wrapper diff_dst_d(pd()->diff_dst_md());
- const memory_desc_wrapper weights_d(pd()->weights_md(0));
- const memory_desc_wrapper diff_src_d(pd()->diff_src_md());
-
- const auto &jcp = kernel_->jcp;
- auto rtus_space = pd()->rtus_.reduce_src_
- ? ctx.get_scratchpad_grantor().template get(
- key_conv_rtus_space)
- : nullptr;
-
- const int ndims = diff_src_d.ndims();
-
- assert(jcp.stride_w == 1 && jcp.stride_h == 1 && jcp.stride_d == 1);
-
- const int stride_d = (ndims == 5) ? pd()->desc()->strides[0] : 1;
- const int stride_h = (ndims == 3) ? 1 : pd()->desc()->strides[ndims - 4];
- const int stride_w = pd()->desc()->strides[ndims - 3];
-
- const int nb_ic = jcp.nb_load;
- const int nb_oc = jcp.nb_reduce;
- const int os_block = jcp.bcast_block;
- const int nb_oc_blocking = jcp.nb_reduce_blocking;
-
- const int work_amount = jcp.mb * jcp.ngroups * jcp.nb_bcast;
-
- auto step = [](int default_step, int remaining, int tail_step) {
- assert(default_step <= tail_step);
- return remaining < tail_step ? remaining : default_step;
- };
-
- parallel(jcp.nthr, [&](const int ithr, const int nthr) {
- auto p = jit_1x1_conv_call_s();
- auto rp = rtus_driver_t::call_params_t();
-
- int bcast_start {0}, bcast_end {0}, icb_start {0}, icb_end {0};
- balance2D(nthr, ithr, work_amount, bcast_start, bcast_end, jcp.nb_load,
- icb_start, icb_end, jcp.load_grp_count);
-
- bool reduce_outer
- = (jcp.loop_order == loop_rbl || jcp.loop_order == loop_rlb);
- int nboc_outer = reduce_outer ? nb_oc : 1;
- int ocb_outer_step = reduce_outer ? nb_oc_blocking : 1;
-
- int nboc_inner = reduce_outer ? 1 : nb_oc;
- int ocb_inner_step = reduce_outer ?
1 : nb_oc_blocking;
- const int max_ic = nstl::min(icb_end * jcp.ic_block, jcp.ic);
-
- for (int ocb_outer = 0; ocb_outer < nboc_outer;
- ocb_outer += ocb_outer_step) {
- size_t cur_ocb_outer
- = nstl::min(ocb_outer + ocb_outer_step, nboc_outer)
- - ocb_outer;
-
- int load_step = 0;
- for (int icb = icb_start; icb < icb_end; icb += load_step) {
- load_step = step(jcp.nb_load_blocking, jcp.nb_load - icb,
- jcp.nb_load_blocking_max);
-
- p.load_dim = this_block_size(
- icb * jcp.ic_block, max_ic, load_step * jcp.ic_block);
- rp.icb = p.load_dim;
- int bcast_step;
- for (int iwork = bcast_start; iwork < bcast_end;
- iwork += bcast_step) {
- int n {0}, g {0}, osb {0};
- nd_iterator_init(iwork, n, jcp.mb, g, jcp.ngroups, osb,
- jcp.nb_bcast);
-
- bcast_step = step(jcp.nb_bcast_blocking, jcp.nb_bcast - osb,
- jcp.nb_bcast_blocking_max);
- bcast_step = nstl::min(bcast_step, bcast_end - iwork);
-
- const int os = osb * os_block;
- p.bcast_dim = this_block_size(
- os, jcp.os, bcast_step * os_block);
- rp.os = p.bcast_dim;
- const int od = os / (jcp.oh * jcp.ow);
- const int os_2d = os % (jcp.oh * jcp.ow);
- const int oh = os_2d / jcp.ow;
- const int ow = os_2d % jcp.ow;
- const int id = od * stride_d;
- const int ih = oh * stride_h;
- const int iw = ow * stride_w;
- rp.iw_start = iw;
- const bool is_dsrc_layout_nxc
- = utils::one_of(jcp.src_tag, format_tag::nwc,
- format_tag::nhwc, format_tag::ndhwc);
- const int ic_off_idx = is_dsrc_layout_nxc
- ? g * jcp.ic + icb * jcp.ic_block
- : g * nb_ic + icb;
- rp.src = diff_src
- + data_blk_off(
- diff_src_d, n, ic_off_idx, id, ih, iw);
- if (pd()->rtus_.reduce_src_) {
- rp.ws = rtus_space
- + ithr * pd()->rtus_.space_per_thread_;
- p.output_data = rp.ws;
- } else
- p.output_data = rp.src;
-
- for (int ocb_inner = 0; ocb_inner < nboc_inner;
- ocb_inner += ocb_inner_step) {
- int cur_ocb_inner
- = nstl::min(ocb_inner + ocb_inner_step,
- nboc_inner)
- - ocb_inner;
-
- int ocb = reduce_outer ? ocb_outer : ocb_inner;
- int nb_oc_blocking_step
- = reduce_outer ? cur_ocb_outer : cur_ocb_inner;
- const bool is_ddst_layout_nxc
- = utils::one_of(jcp.dst_tag, format_tag::nwc,
- format_tag::nhwc, format_tag::ndhwc);
- const int oc_off_idx = is_ddst_layout_nxc
- ? g * jcp.oc + ocb * jcp.oc_block
- : g * nb_oc + ocb;
- size_t diff_dst_off = data_blk_off(
- diff_dst_d, n, oc_off_idx, od, oh, ow);
- p.bcast_data = &diff_dst[diff_dst_off];
-
- p.load_data = &weights[pd()->with_groups()
- ? weights_d.blk_off(g, ocb, icb)
- : weights_d.blk_off(ocb, icb)];
-
- p.first_last_flag = ocb == 0 ? FLAG_REDUCE_FIRST : 0;
-
- p.reduce_dim = this_block_size(ocb * jcp.oc_block,
- jcp.oc, nb_oc_blocking_step * jcp.oc_block);
-
- (*kernel_)(&p);
- }
- if (pd()->rtus_.reduce_src_) (*rtus_driver_)(&rp);
- }
- }
- }
- });
-}
-
-template struct jit_sve_512_1x1_convolution_bwd_data_t;
-
-/* convolution backward wrt weights */
-
-#define wht_blk_off(d, g, ...) \
- (pd()->with_groups() ?
(d).blk_off((g), __VA_ARGS__) \ - : (d).blk_off(__VA_ARGS__)) - -status_t jit_sve_512_1x1_convolution_bwd_weights_t ::init(engine_t *engine) { - - CHECK(safe_ptr_assign(kernel_, - new jit_sve_512_1x1_conv_kernel( - pd()->jcp_, *pd()->attr(), *pd()->dst_md(0)))); - CHECK(safe_ptr_assign( - acc_ker_, new cpu_accumulator_1d_t())); - CHECK(safe_ptr_assign(reducer_bias_, - new cpu_reducer_t(pd()->reducer_bia_conf_))); - CHECK(kernel_->create_kernel()); - CHECK(acc_ker_->create_kernel()); - CHECK(reducer_bias_->create_kernel()); - - CHECK(init_rtus_driver(this)); - return status::success; -} - -void jit_sve_512_1x1_convolution_bwd_weights_t::execute_backward_weights( - const exec_ctx_t &ctx) const { - auto diff_dst = CTX_IN_MEM(const data_t *, DNNL_ARG_DIFF_DST); - auto src = CTX_IN_MEM(const data_t *, DNNL_ARG_SRC); - auto diff_weights = CTX_OUT_MEM(data_t *, DNNL_ARG_DIFF_WEIGHTS); - auto diff_bias_in = CTX_OUT_MEM(data_t *, DNNL_ARG_DIFF_BIAS); - - const memory_desc_wrapper diff_dst_d(pd()->diff_dst_md()); - const memory_desc_wrapper src_d(pd()->src_md()); - const memory_desc_wrapper diff_weights_d(pd()->diff_weights_md(0)); - - const auto &jcp = kernel_->jcp; - - const auto scratchpad = ctx.get_scratchpad_grantor(); - auto rtus_space = pd()->rtus_.reduce_src_ - ? scratchpad.get(key_conv_rtus_space) - : NULL; - const bool is_bias_padded - = pd()->with_bias() && jcp.oc_without_padding % jcp.oc_block != 0; - - data_t *diff_bias = is_bias_padded - ? scratchpad.get(key_conv_padded_bias) - : diff_bias_in; - auto wei_reduction = scratchpad.get(key_conv_wei_reduction); - - const int ndims = src_d.ndims(); - const int wei_size = jcp.ngroups * rnd_up(jcp.oc, jcp.oc_block) - * rnd_up(jcp.ic, jcp.ic_block); - - simple_barrier::ctx_t reduction_barrier; - simple_barrier::ctx_init(&reduction_barrier); - - const auto reducer_bia_scratchpad - = memory_tracking::grantor_t(scratchpad, prefix_reducer_bia); - auto rb = this->reducer_bias_.get(); - rb->init(reducer_bia_scratchpad); - - // TODO (Roma): remove this restriction - assert(jcp.stride_w == 1 && jcp.stride_h == 1); - - const int nb_ic = jcp.nb_bcast; - const int nb_ic_blocking = jcp.nb_bcast_blocking; - - const int nb_oc = jcp.nb_load; - const int nb_oc_blocking = jcp.nb_load_blocking; - - const int sp_nb = jcp.nb_reduce; - const int mb_sp_work = jcp.mb * sp_nb; - - const int stride_h = (ndims == 3) ? 1 : pd()->desc()->strides[0]; - const int stride_w = pd()->desc()->strides[ndims - 3]; - - auto step = [](int default_step, int remaining, int tail_step) { - assert(default_step <= tail_step); - return remaining < tail_step ? remaining : default_step; - }; - - const bool is_src_layout_nxc = utils::one_of( - jcp.src_tag, format_tag::nwc, format_tag::nhwc, format_tag::ndhwc); - - const bool is_ddst_layout_nxc = utils::one_of( - jcp.dst_tag, format_tag::nwc, format_tag::nhwc, format_tag::ndhwc); - - auto maybe_zero_icpad = [&](const int g_start, const int g_end, - const int ocb_start, const int ocb_end) { - // write zeros to IC padded region. 
- const int ic_tail = jcp.ic_without_padding % jcp.ic_block; - if (is_ddst_layout_nxc && ic_tail != 0) { - for_(int g = g_start; g < g_end; ++g) - for (int z_ocb = ocb_start; z_ocb < ocb_end; ++z_ocb) { - const int z_icb = nb_ic - 1; - const size_t off = wht_blk_off(diff_weights_d, g, z_ocb, z_icb) - + ic_tail * jcp.oc_block; - data_t *z_wei = diff_weights + off; - const int zero_work - = (nb_ic * jcp.ic_block - jcp.ic_without_padding) - * jcp.oc_block; - PRAGMA_OMP_SIMD() - for (int o = 0; o < zero_work; ++o) { - z_wei[o] = 0; - } - } - } - }; - - auto ker = [&](const int ithr, const int nthr) { - assert(nthr == jcp.nthr); - - const int ithr_ic_b = ithr % jcp.nthr_ic_b; - const int ithr_oc_b = ithr / jcp.nthr_ic_b % jcp.nthr_oc_b; - const int ithr_g = ithr / jcp.nthr_ic_b / jcp.nthr_oc_b % jcp.nthr_g; - const int ithr_mb = ithr / jcp.nthr_ic_b / jcp.nthr_oc_b / jcp.nthr_g; - - /* reduction dimension */ - int mb_sp_b_start {0}, mb_sp_b_end {0}; - balance211( - mb_sp_work, jcp.nthr_mb, ithr_mb, mb_sp_b_start, mb_sp_b_end); - - /* independent dimensions */ - int g_start {0}, oc_b_start {0}, ic_b_start {0}; - int g_end {0}, oc_b_end {0}, ic_b_end {0}; - - balance211(jcp.ngroups, jcp.nthr_g, ithr_g, g_start, g_end); - balance211(jcp.nb_load, jcp.nthr_oc_b, ithr_oc_b, oc_b_start, oc_b_end); - balance211( - jcp.nb_bcast, jcp.nthr_ic_b, ithr_ic_b, ic_b_start, ic_b_end); - - const int g_work = g_end - g_start; - const int oc_b_work = oc_b_end - oc_b_start; - const int ic_b_work = ic_b_end - ic_b_start; - const bool cache_aliasing - = (jcp.ic * jcp.ngroups * sizeof(float)) % 1024 == 0; - int reduce_step = jcp.nb_reduce_blocking; - int reduce_step_max = jcp.nb_reduce_blocking_max; - if (is_src_layout_nxc && cache_aliasing) { - // Experiments show 4 is a magic number with the tested shapes. - // TODO: maybe tune for shapes with sp_dim%4 != 0 - reduce_step = nstl::min(4, reduce_step); - reduce_step_max = reduce_step; - } - - data_t *diff_wei = ithr_mb == 0 - ? diff_weights - : wei_reduction + (ithr_mb - 1) * wei_size; - - int sp_b_step = 0; - for (int mb_sp_b = mb_sp_b_start; mb_sp_b < mb_sp_b_end; - mb_sp_b += sp_b_step) { - int img {0}, sp_b {0}; - nd_iterator_init(mb_sp_b, img, jcp.mb, sp_b, sp_nb); - sp_b_step = step(reduce_step, - nstl::min(sp_nb - sp_b, mb_sp_b_end - mb_sp_b), - reduce_step_max); - - for (int g = g_start; g < g_end; ++g) { - int load_step = 0; - int bcast_step = 0; - for (int ic_b = ic_b_start; ic_b < ic_b_end; - ic_b += bcast_step) { - if (is_src_layout_nxc && cache_aliasing) { - bcast_step = ic_b_work; - } else { - bcast_step = step(nb_ic_blocking, ic_b_end - ic_b, - jcp.nb_bcast_blocking_max); - } - - for (int oc_b = oc_b_start; oc_b < oc_b_end; - oc_b += load_step) { - load_step = step(nb_oc_blocking, oc_b_end - oc_b, - jcp.nb_load_blocking_max); - const int _ic_b = g * nb_ic + ic_b; - const int oc_off_idx = is_ddst_layout_nxc - ? g * jcp.oc + oc_b * jcp.oc_block - : g * nb_oc + oc_b; - - data_t *store_to; - - const size_t off - = wht_blk_off(diff_weights_d, g, oc_b, ic_b); - store_to = diff_wei + off; - - const int ic_off_idx - = (is_src_layout_nxc ? 
jcp.ic_block : 1) - * _ic_b; - const data_t *diff_src - = &src[src_d.blk_off(img, ic_off_idx)]; - - int sp_b_end = sp_b + sp_b_step; - const data_t *pdiff_dst = &diff_dst[diff_dst_d.blk_off( - img, oc_off_idx)]; - const data_t *local_src = diff_src; - - auto p = jit_1x1_conv_call_s(); - auto rp = rtus_driver_t::call_params_t(); - p.output_stride = utils::rnd_up(jcp.ic, jcp.ic_block) - * jcp.oc_block * jcp.typesize_out; - - p.load_dim = this_block_size(oc_b * jcp.oc_block, - jcp.oc, load_step * jcp.oc_block); - - p.bcast_dim = this_block_size(ic_b * jcp.ic_block, - jcp.ic, bcast_step * jcp.ic_block); - rp.icb = p.bcast_dim; - p.output_data = store_to; - - p.reduce_dim = sp_b_step * jcp.reduce_block; - rp.os = p.reduce_dim; - p.first_last_flag = 0 - | (mb_sp_b == mb_sp_b_start ? FLAG_REDUCE_FIRST - : 0) - | (sp_b_end == sp_nb ? FLAG_SP_LAST : 0); - - int sp = sp_b * jcp.reduce_block; - int oc_mult - = is_ddst_layout_nxc ? jcp.oc : jcp.oc_block; - p.load_data = pdiff_dst + sp * oc_mult; - - if (pd()->rtus_.reduce_src_) { - const int oh = sp / jcp.ow; - const int ow = sp % jcp.ow; - - const int ih = oh * stride_h; - const int iw = ow * stride_w; - rp.iw_start = iw; - - rp.ws = rtus_space - + ithr * pd()->rtus_.space_per_thread_ - + sp * jcp.ic_block; - - if (ndims == 3) - rp.src = local_src - + iw * src_d.blocking_desc().strides[2]; - else - rp.src = local_src - + ih * src_d.blocking_desc().strides[2] - + iw * src_d.blocking_desc().strides[3]; - (*rtus_driver_)(&rp); - - p.bcast_data = rp.ws; - } else { - int ic_mult - = is_src_layout_nxc ? jcp.ic : jcp.ic_block; - p.bcast_data = local_src + sp * ic_mult; - } - - (*kernel_)(&p); - } - } - } - } - - if (ithr_mb == 0 && ic_b_end >= jcp.nb_bcast) { - maybe_zero_icpad(g_start, g_end, oc_b_start, oc_b_end); - } - - /* diff_weights[:] += sum(wei_reduction[thr_mb][:]) */ - if (dnnl_thr_syncable() && jcp.nthr_mb > 1) { - simple_barrier::barrier(&reduction_barrier, jcp.nthr); - const int work = g_work * oc_b_work * ic_b_work; - int start {0}, end {0}; - balance211(work, jcp.nthr_mb, ithr_mb, start, end); - if (start == end) return; - - for (int thr_mb = 1; thr_mb < jcp.nthr_mb; ++thr_mb) { - int w = start; - int sub_g_start {0}, sub_oc_b_start {0}, sub_ic_b_start {0}; - nd_iterator_init(w, sub_g_start, g_work, sub_oc_b_start, - oc_b_work, sub_ic_b_start, ic_b_work); - while (w < end) { - const int g = g_start + sub_g_start; - const int oc_b = oc_b_start + sub_oc_b_start; - const int ic_b = ic_b_start + sub_ic_b_start; - const int ic_to_accumulate - = nstl::min(end - w, ic_b_work - sub_ic_b_start) - * jcp.ic_block; - const int acc_size - = this_block_size(ic_b * jcp.ic_block, - jcp.ic_without_padding, ic_to_accumulate) - * jcp.oc_block; - - const size_t off - = wht_blk_off(diff_weights_d, g, oc_b, ic_b); - data_t *d = diff_weights + off; - data_t *s = wei_reduction + (thr_mb - 1) * wei_size + off; - - acc_ker_->accumulate(d, s, acc_size); - - nd_iterator_jump(w, end, sub_g_start, g_work, - sub_oc_b_start, oc_b_work, sub_ic_b_start, - ic_b_work); - } - } - } - }; - - auto ker_bias = [&](int ithr, int nthr) { - assert(nthr == rb->balancer().nthr_); - - const int b_job_start = rb->balancer().ithr_job_off(ithr); - const int b_njobs = rb->balancer().ithr_njobs(ithr); - - if (b_njobs == 0) return; - - /* reduction dimension */ - int img_start {0}, img_end {0}; - - balance211(jcp.mb, rb->balancer().nthr_per_group_, - rb->balancer().id_in_group(ithr), img_start, img_end); - - /* jobs */ - int g_start {0}, ocb_start {0}; - nd_iterator_init( - b_job_start, 
g_start, jcp.ngroups, ocb_start, jcp.nb_load); - - for (int img = img_start; img < img_end; ++img) { - int g = g_start, ocb = ocb_start; - for (int b_job_loc = 0; b_job_loc < b_njobs; ++b_job_loc) { - const int oc_off_idx = is_ddst_layout_nxc - ? g * jcp.oc + ocb * jcp.oc_block - : g * jcp.nb_load + ocb; - const data_t *d_dst - = &diff_dst[diff_dst_d.blk_off(img, oc_off_idx)]; - - data_t *d_bias = rb->get_local_ptr(ithr, diff_bias, - reducer_bia_scratchpad) - + b_job_loc * rb->balancer().job_size_; - const int sp_shift = is_ddst_layout_nxc ? jcp.ngroups * jcp.oc - : jcp.oc_block; - const auto max_oc = this_block_size( - ocb * jcp.oc_block, jcp.oc, jcp.oc_block); - if (img == img_start) - for (int o = 0; o < 16; ++o) - d_bias[o] = 0.; - - for (int os = 0; os < jcp.os; ++os) { - PRAGMA_OMP_SIMD() - for (int o = 0; o < max_oc; ++o) - d_bias[o] += d_dst[o]; - d_dst += sp_shift; - } - - nd_iterator_step(g, jcp.ngroups, ocb, jcp.nb_load); - } - } - - if (dnnl_thr_syncable()) - rb->reduce(ithr, diff_bias, reducer_bia_scratchpad); - }; - - if (dnnl_thr_syncable()) { - parallel(jcp.nthr, [&](const int ithr, const int nthr) { - ker(ithr, jcp.nthr); - if (pd()->with_bias()) ker_bias(ithr, jcp.nthr); - }); - } else { - parallel(jcp.nthr, [&](int ithr, int nthr) { ker(ithr, nthr); }); - if (jcp.nthr_mb > 1) - parallel(jcp.nthr, [&](int ithr, int nthr) { - assert(nthr == jcp.nthr); - - const int ithr_ic_b = ithr % jcp.nthr_ic_b; - const int ithr_oc_b = ithr / jcp.nthr_ic_b % jcp.nthr_oc_b; - const int ithr_g - = ithr / jcp.nthr_ic_b / jcp.nthr_oc_b % jcp.nthr_g; - const int ithr_mb - = ithr / jcp.nthr_ic_b / jcp.nthr_oc_b / jcp.nthr_g; - - /* independent dimensions */ - int g_start {0}, oc_b_start {0}, ic_b_start {0}; - int g_end {0}, oc_b_end {0}, ic_b_end {0}; - - balance211(jcp.ngroups, jcp.nthr_g, ithr_g, g_start, g_end); - balance211(jcp.nb_load, jcp.nthr_oc_b, ithr_oc_b, oc_b_start, - oc_b_end); - balance211(jcp.nb_bcast, jcp.nthr_ic_b, ithr_ic_b, ic_b_start, - ic_b_end); - - const int g_work = g_end - g_start; - const int oc_b_work = oc_b_end - oc_b_start; - const int ic_b_work = ic_b_end - ic_b_start; - - const int work = g_work * oc_b_work * ic_b_work; - int start {0}, end {0}; - balance211(work, jcp.nthr_mb, ithr_mb, start, end); - if (start == end) return; - - for (int thr_mb = 1; thr_mb < jcp.nthr_mb; ++thr_mb) { - int w = start; - int sub_g_start {0}, sub_oc_b_start {0}, sub_ic_b_start {0}; - nd_iterator_init(w, sub_g_start, g_work, sub_oc_b_start, - oc_b_work, sub_ic_b_start, ic_b_work); - while (w < end) { - const int g = g_start + sub_g_start; - const int oc_b = oc_b_start + sub_oc_b_start; - const int ic_b = ic_b_start + sub_ic_b_start; - const int ic_to_accumulate - = nstl::min(end - w, ic_b_work - sub_ic_b_start) - * jcp.ic_block; - const int acc_size - = this_block_size(ic_b * jcp.ic_block, - jcp.ic_without_padding, - ic_to_accumulate) - * jcp.oc_block; - - const size_t off - = wht_blk_off(diff_weights_d, g, oc_b, ic_b); - data_t *d = diff_weights + off; - data_t *s - = wei_reduction + (thr_mb - 1) * wei_size + off; - - acc_ker_->accumulate(d, s, acc_size); - - nd_iterator_jump(w, end, sub_g_start, g_work, - sub_oc_b_start, oc_b_work, sub_ic_b_start, - ic_b_work); - } - } - }); - if (pd()->with_bias()) { - parallel(jcp.nthr, - [&](int ithr, int nthr) { ker_bias(ithr, nthr); }); - parallel(jcp.nthr, [&](int ithr, int nthr) { - assert(nthr == rb->balancer().nthr_); - MAYBE_UNUSED(nthr); - if (rb->balancer().ithr_njobs(ithr) == 0) return; - rb->reduce_nolock(ithr, diff_bias, 
reducer_bia_scratchpad); - }); - } - } - - /* TODO: put this in ker_bias */ - if (is_bias_padded) { - assert(IMPLICATION(!is_ddst_layout_nxc, jcp.ngroups == 1)); - const int padded_stride = rnd_up(jcp.oc, jcp.oc_block); - const int stride = jcp.oc_without_padding; - for (int g = 0; g < jcp.ngroups; ++g) { - utils::array_copy(diff_bias_in + g * stride, - diff_bias + g * padded_stride, stride); - } - } -} - -} // namespace aarch64 -} // namespace cpu -} // namespace impl -} // namespace dnnl diff --git a/src/cpu/aarch64/jit_sve_512_1x1_convolution.hpp b/src/cpu/aarch64/jit_sve_512_1x1_convolution.hpp deleted file mode 100644 index ac7b1f2b1d1..00000000000 --- a/src/cpu/aarch64/jit_sve_512_1x1_convolution.hpp +++ /dev/null @@ -1,534 +0,0 @@ -/******************************************************************************* -* Copyright 2021-2023 Intel Corporation -* Copyright 2021-2023 FUJITSU LIMITED -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -#ifndef CPU_AARCH64_JIT_SVE_1X1_CONVOLUTION_HPP -#define CPU_AARCH64_JIT_SVE_1X1_CONVOLUTION_HPP - -#include "common/c_types_map.hpp" -#include "common/dnnl_thread.hpp" -#include "common/memory_tracking.hpp" -#include "common/primitive.hpp" -#include "common/primitive_hashing.hpp" -#include "common/utils.hpp" - -#include "cpu/cpu_convolution_pd.hpp" -#include "cpu/dw_convolution_utils.hpp" -#include "cpu/platform.hpp" - -#include "cpu/aarch64/cpu_reducer.hpp" -#include "cpu/aarch64/jit_sve_512_1x1_conv_kernel.hpp" -#include "cpu/aarch64/jit_uni_1x1_conv_utils.hpp" -#include "cpu/aarch64/jit_uni_dw_convolution.hpp" - -namespace dnnl { -namespace impl { -namespace cpu { -namespace aarch64 { - -template -struct jit_sve_512_1x1_convolution_fwd_t : public primitive_t { - struct pd_t : public cpu_convolution_fwd_pd_t { - pd_t(const convolution_desc_t *adesc, const primitive_attr_t *attr, - const typename pd_t::base_class *hint_fwd_pd) - : cpu_convolution_fwd_pd_t(adesc, attr, hint_fwd_pd) - , jcp_() - , rtus_() {} - pd_t(const pd_t &other) : cpu_convolution_fwd_pd_t(other) { - if (copy(other) != status::success) is_initialized_ = false; - } - - DECLARE_COMMON_PD_T(JIT_IMPL_NAME_HELPER("jit_1x1:", sve_512, ""), - jit_sve_512_1x1_convolution_fwd_t); - - status_t init(engine_t *engine) { - using namespace utils; - - bool ok = true && is_fwd() - && set_default_alg_kind(alg_kind::convolution_direct) - && expect_data_types(src_type, wei_type, dst_type, dst_type, - data_type::undef) - && attr()->has_default_values( - primitive_attr_t::skip_mask_t::post_ops, dst_type) - && !has_zero_dim_memory() && set_default_formats() - && attr_.set_default_formats(dst_md(0)) == status::success; - if (!ok) return status::unimplemented; - - const convolution_desc_t *conv_d = desc(); - const memory_desc_t *src_d = src_md(); - rtus_prepare(this, conv_d, src_d, dst_md()); - - CHECK(jit_sve_512_1x1_conv_kernel::init_conf(jcp_, *conv_d, *src_d, - *weights_md(), *dst_md(), *attr(), dnnl_get_max_threads(), 
- rtus_.reduce_src_)); - if (jcp_.with_dw_conv) CHECK(depthwise_po_init(engine)); - - auto scratchpad = scratchpad_registry().registrar(); - jit_sve_512_1x1_conv_kernel::init_scratchpad(scratchpad, jcp_); - - rtus_prepare_space_info(this, scratchpad, jcp_.nthr); - - return status::success; - } - - const memory_desc_t *dst_md( - int index = 0, bool user_input = false) const override { - return jcp_.with_dw_conv - ? dw_conv_pd_->dst_md(index, user_input) - : cpu_convolution_fwd_pd_t::dst_md(index, user_input); - } - - const memory_desc_t *arg_md( - int arg, bool user_input = false) const override { - if (jcp_.with_dw_conv) { - switch (arg) { - case DNNL_ARG_ATTR_POST_OP_DW | DNNL_ARG_SRC: - return cpu_convolution_fwd_pd_t::dst_md(0, user_input); - case DNNL_ARG_ATTR_POST_OP_DW | DNNL_ARG_WEIGHTS: - return dw_conv_pd_->weights_md(0); - case DNNL_ARG_ATTR_POST_OP_DW | DNNL_ARG_BIAS: - return dw_conv_pd_->weights_md(1); - default: break; - } - } - return convolution_fwd_pd_t::arg_md(arg, user_input); - } - - arg_usage_t arg_usage(int arg) const override { - if (arg == (DNNL_ARG_ATTR_POST_OP_DW | DNNL_ARG_WEIGHTS)) - return arg_usage_t::input; - - if (arg == (DNNL_ARG_ATTR_POST_OP_DW | DNNL_ARG_BIAS) - && attr_post_op_dw_inputs() > 1) - return arg_usage_t::input; - - return convolution_fwd_pd_t::arg_usage(arg); - } - - jit_1x1_conv_conf_t jcp_; - reduce_to_unit_stride_t rtus_; - using dw_pd_t = jit_sve_512_dw_convolution_fwd_t::pd_t; - std::unique_ptr dw_conv_pd_; - - protected: - bool set_default_formats() { - using namespace format_tag; - - const memory_desc_wrapper src_d(&src_md_); - const memory_desc_wrapper dst_d(&dst_md_); - - const auto dat_tag_nxc = utils::pick(ndims() - 3, nwc, nhwc, ndhwc); - const auto dat_tag_nCx16c - = utils::pick(ndims() - 3, nCw16c, nChw16c, nCdhw16c); - const auto curr_src_tag - = src_d.matches_one_of_tag(dat_tag_nxc, dat_tag_nCx16c); - const auto curr_dst_tag - = dst_d.matches_one_of_tag(dat_tag_nxc, dat_tag_nCx16c); - const auto is_data_layout_nxc - = IMPLICATION(curr_src_tag != dat_tag_nxc, - src_d.format_kind() == format_kind::any) - && IMPLICATION(curr_dst_tag != dat_tag_nxc, - dst_d.format_kind() == format_kind::any) - && utils::one_of(dat_tag_nxc, curr_src_tag, curr_dst_tag); - auto dat_tag = is_data_layout_nxc ? dat_tag_nxc : dat_tag_nCx16c; - auto wei_tag = utils::pick(2 * ndims() - 6 + with_groups(), - OIw16i16o, gOIw16i16o, OIhw16i16o, gOIhw16i16o, OIdhw16i16o, - gOIdhw16i16o); - - return set_default_formats_common(dat_tag, wei_tag, dat_tag); - } - status_t copy(const pd_t &other) { - jcp_ = other.jcp_; - rtus_ = other.rtus_; - if (other.dw_conv_pd_) { - dw_conv_pd_.reset(other.dw_conv_pd_->clone()); - if (!dw_conv_pd_) return status::out_of_memory; - } - return status::success; - } - - status_t depthwise_po_init(engine_t *engine) { - - using namespace memory_tracking; - auto &jcp_1x1 = jcp_; - primitive_attr_t attr_1x1(*attr()); - if (!attr_1x1.is_initialized()) return status::out_of_memory; - const auto &src_md = dst_md_; - const memory_desc_wrapper src_d(src_md); - const auto nthr = dnnl_get_max_threads(); - auto l2_cache = platform::get_per_core_cache_size(2) * nthr; - - // Note: A robust fusion implementation would be to check if both - // 1x1 conv and dw conv that are considered here for fusion are - // optimal independently. This would require creating a new - // primitive_desc through primitive_iterator & check if they match. 
-            // Due to concern that these creations and/or checks could be heavy,
-            // for 1x1: Check that no better ISA is available.
-            // for dw: Always fuse with same ISA.
-            // Caveat: Maybe a better dw conv exists.
-
-            // TODO: Add a check if a better ISA exists following above note.
-            bool ok = true
-                    && (attr_1x1.post_ops_.find(primitive_kind::sum) == -1)
-                    // TODO: Below may be further tuned.
-                    && (l2_cache * 2 < src_d.size())
-                    // load_grp_count check can be redundant due to l2 check
-                    // above. Adding it explicitly as the current driver doesn't
-                    // work if this condition fails.
-                    && (jcp_1x1.load_grp_count < 2);
-            if (!ok) return status::unimplemented;
-
-            int dw_po_index
-                    = attr_1x1.post_ops_.find(primitive_kind::convolution);
-            convolution_desc_t cd_dw;
-            primitive_attr_t attr_dw;
-            CHECK(get_depthwise_conv_desc(
-                    cd_dw, src_md, attr_1x1, attr_dw, dw_po_index));
-
-            CHECK(safe_ptr_assign(
-                    dw_conv_pd_, new dw_pd_t(&cd_dw, &attr_dw, nullptr)));
-            CHECK(dw_conv_pd_->init(engine));
-            auto &jcp_dw = dw_conv_pd_->jcp_;
-
-            ok = true
-                    && (dnnl_memory_desc_equal(&src_md, dw_conv_pd_->src_md(0)))
-                    && (jcp_1x1.oc_without_padding % jcp_1x1.oc_block == 0)
-                    && IMPLICATION(
-                            jcp_dw.ow_block, jcp_dw.ow_block == jcp_dw.ow);
-            if (!ok) return status::unimplemented;
-
-            assert(dw_conv_pd_->dst_md(0)->format_kind != format_kind::any);
-            assert(dw_conv_pd_->weights_md(0)->format_kind != format_kind::any);
-            assert(IMPLICATION(
-                    dw_conv_pd_->weights_md(1)->data_type != data_type::undef,
-                    dw_conv_pd_->weights_md(1)->format_kind
-                            != format_kind::any));
-
-            jcp_dw.is_fused_conv = true;
-            // TODO: Support/experiment arbitrary oc_work in dw conv.
-            // Until then we keep oc_work perfectly divisible.
-            while (jcp_1x1.nb_load % jcp_1x1.nb_load_blocking != 0)
-                --jcp_1x1.nb_load_blocking;
-            jcp_1x1.nb_load_blocking_max = jcp_1x1.nb_load_blocking;
-
-            while (jcp_1x1.nb_load_blocking % jcp_dw.nb_ch_blocking != 0)
-                --jcp_dw.nb_ch_blocking;
-
-            jcp_dw.dw_conv_buffer_oc
-                    = jcp_1x1.nb_load_blocking * jcp_1x1.oc_block;
-
-            const auto dat_tag_nxc = utils::pick(ndims() - 3, format_tag::nwc,
-                    format_tag::nhwc, format_tag::ndhwc);
-            const bool is_data_nxc = utils::everyone_is(
-                    dat_tag_nxc, jcp_1x1.src_tag, jcp_1x1.dst_tag);
-            if (!is_data_nxc)
-                jcp_1x1.bcast_loop_output_step = jcp_1x1.ur * jcp_1x1.load_block
-                        * jcp_1x1.typesize_out;
-
-            registrar_t scratchpad(scratchpad_registry_);
-            registrar_t dw_scratchpad(scratchpad, names::prefix_fusion);
-
-            size_t dw_conv_buffer_size_ = (size_t)nthr * jcp_dw.kh * jcp_dw.iw
-                    * jcp_dw.dw_conv_buffer_oc;
-            assert(dw_conv_buffer_size_);
-            dw_scratchpad.book(memory_tracking::names::key_fusion_inout_buffer,
-                    dw_conv_buffer_size_,
-                    types::data_type_size(dw_conv_pd_->src_md()->data_type));
-
-            jit_uni_dw_conv_fwd_kernel::init_scratchpad(dw_scratchpad, jcp_dw);
-
-            return status::success;
-        }
-    };
-
-    template <cpu_isa_t isa, typename conv_t>
-    friend status_t init_rtus_driver(conv_t *self);
-
-    jit_sve_512_1x1_convolution_fwd_t(const pd_t *apd) : primitive_t(apd) {}
-
-    typedef typename prec_traits<src_type>::type src_data_t;
-    typedef typename prec_traits<wei_type>::type wei_data_t;
-    typedef typename prec_traits<dst_type>::type dst_data_t;
-
-    status_t init(engine_t *engine) override {
-        CHECK(safe_ptr_assign(kernel_,
-                new jit_sve_512_1x1_conv_kernel(
-                        pd()->jcp_, *pd()->attr(), *pd()->dst_md(0))));
-        CHECK(kernel_->create_kernel());
-
-        if (pd()->jcp_.with_dw_conv) {
-            CHECK(safe_ptr_assign(
-                    kernel_dw_, new dw_conv_kernel_t(pd()->dw_conv_pd_->jcp_)));
-            CHECK(kernel_dw_->create_kernel());
-        }
-
-        CHECK(init_rtus_driver<sve_512>(this));
-
return status::success; - } - - status_t execute(const exec_ctx_t &ctx) const override { - execute_forward(ctx); - return status::success; - } - -private: - void execute_forward(const exec_ctx_t &ctx) const; - void execute_forward_thr(const int ithr, const int nthr, - const src_data_t *src, const wei_data_t *weights, - const dst_data_t *bias, const wei_data_t *weights_dw, - const dst_data_t *bias_dw, dst_data_t *dst, - const memory_tracking::grantor_t &scratchpad, - const void *post_ops_binary_rhs_arg_vec, - const void *post_ops_binary_rhs_arg_vec_dw) const; - const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); } - - std::unique_ptr kernel_; - std::unique_ptr> rtus_driver_; - using dw_conv_kernel_t = jit_uni_dw_conv_fwd_kernel_f32; - std::unique_ptr kernel_dw_; -}; - -using jit_sve_512_1x1_convolution_fwd_f32_t - = jit_sve_512_1x1_convolution_fwd_t; - -template -struct jit_sve_512_1x1_convolution_bwd_data_t : public primitive_t { - struct pd_t : public cpu_convolution_bwd_data_pd_t { - pd_t(const convolution_desc_t *adesc, const primitive_attr_t *attr, - const convolution_fwd_pd_t *hint_fwd_pd) - : cpu_convolution_bwd_data_pd_t(adesc, attr, hint_fwd_pd) - , jcp_() - , rtus_() {} - DECLARE_COMMON_PD_T(JIT_IMPL_NAME_HELPER("jit_1x1:", sve_512, ""), - jit_sve_512_1x1_convolution_bwd_data_t); - - status_t init(engine_t *engine) { - bool ok = true && desc()->prop_kind == prop_kind::backward_data - && set_default_alg_kind(alg_kind::convolution_direct) - && expect_data_types(diff_src_type, wei_type, - data_type::undef, diff_dst_type, data_type::undef) - && attr()->has_default_values() && !has_zero_dim_memory() - && set_default_formats(); - if (!ok) return status::unimplemented; - - const convolution_desc_t *conv_d = desc(); - const memory_desc_t *diff_src_d = diff_src_md(); - rtus_prepare(this, conv_d, diff_src_d, diff_dst_md()); - - status_t status = jit_sve_512_1x1_conv_kernel::init_conf(jcp_, - *conv_d, *diff_src_d, *weights_md(), *diff_dst_md(), - *attr(), dnnl_get_max_threads(), rtus_.reduce_src_); - if (status != status::success) return status; - - auto scratchpad = scratchpad_registry().registrar(); - jit_sve_512_1x1_conv_kernel::init_scratchpad(scratchpad, jcp_); - - rtus_prepare_space_info(this, scratchpad, jcp_.nthr); - - return status::success; - } - - // TODO (Roma): structs conf header cleanup - jit_1x1_conv_conf_t jcp_; - reduce_to_unit_stride_t rtus_; - - protected: - bool set_default_formats() { - using namespace format_tag; - - const memory_desc_wrapper diff_src_d(&diff_src_md_); - const memory_desc_wrapper diff_dst_d(&diff_dst_md_); - - const auto dat_tag_nxc = utils::pick(ndims() - 3, nwc, nhwc, ndhwc); - const auto dat_tag_nCx16c - = utils::pick(ndims() - 3, nCw16c, nChw16c, nCdhw16c); - const auto curr_src_tag = diff_src_d.matches_one_of_tag( - dat_tag_nxc, dat_tag_nCx16c); - const auto curr_dst_tag = diff_dst_d.matches_one_of_tag( - dat_tag_nxc, dat_tag_nCx16c); - const auto is_data_layout_nxc - = IMPLICATION(curr_src_tag != dat_tag_nxc, - diff_src_d.format_kind() == format_kind::any) - && IMPLICATION(curr_dst_tag != dat_tag_nxc, - diff_dst_d.format_kind() == format_kind::any) - && utils::one_of(dat_tag_nxc, curr_src_tag, curr_dst_tag); - auto dat_tag = is_data_layout_nxc ? 
dat_tag_nxc : dat_tag_nCx16c; - auto wei_tag = utils::pick(2 * ndims() - 6 + with_groups(), - IOw16o16i, gIOw16o16i, IOhw16o16i, gIOhw16o16i, IOdhw16o16i, - gIOdhw16o16i); - - return set_default_formats_common(dat_tag, wei_tag, dat_tag); - } - }; - - template - friend status_t init_rtus_driver(conv_t *self); - - jit_sve_512_1x1_convolution_bwd_data_t(const pd_t *apd) - : primitive_t(apd) {} - - typedef typename prec_traits::type diff_dst_data_t; - typedef typename prec_traits::type wei_data_t; - typedef typename prec_traits::type diff_src_data_t; - - status_t init(engine_t *engine) override { - CHECK(safe_ptr_assign(kernel_, - new jit_sve_512_1x1_conv_kernel( - pd()->jcp_, *pd()->attr(), *pd()->dst_md(0)))); - CHECK(kernel_->create_kernel()); - CHECK(init_rtus_driver(this)); - return status::success; - } - - status_t execute(const exec_ctx_t &ctx) const override { - execute_backward_data(ctx); - return status::success; - } - -private: - void execute_backward_data(const exec_ctx_t &ctx) const; - const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); } - std::unique_ptr kernel_; - std::unique_ptr> rtus_driver_; -}; - -using jit_sve_512_1x1_convolution_bwd_data_f32_t - = jit_sve_512_1x1_convolution_bwd_data_t; - -/* Backward weight */ -struct jit_sve_512_1x1_convolution_bwd_weights_t : public primitive_t { - struct pd_t : public cpu_convolution_bwd_weights_pd_t { - pd_t(const convolution_desc_t *adesc, const primitive_attr_t *attr, - const convolution_fwd_pd_t *hint_fwd_pd) - : cpu_convolution_bwd_weights_pd_t(adesc, attr, hint_fwd_pd) - , jcp_() - , rtus_() {} - - DECLARE_COMMON_PD_T(JIT_IMPL_NAME_HELPER("jit_1x1:", sve_512, ""), - jit_sve_512_1x1_convolution_bwd_weights_t); - - status_t init(engine_t *engine) { - bool ok = true && desc()->prop_kind == prop_kind::backward_weights - && set_default_alg_kind(alg_kind::convolution_direct) - && expect_data_types(data_type::f32, data_type::f32, - data_type::f32, data_type::f32, data_type::f32) - && attr()->has_default_values() && !has_zero_dim_memory() - && set_default_formats(); - if (!ok) return status::unimplemented; - - const convolution_desc_t *conv_d = desc(); - const memory_desc_t *src_d = src_md(); - rtus_prepare(this, conv_d, src_d, diff_dst_md()); - - status_t status = jit_sve_512_1x1_conv_kernel::init_conf(jcp_, - *conv_d, *src_d, *diff_weights_md(), *diff_dst_md(), - *attr(), dnnl_get_max_threads(), rtus_.reduce_src_); - if (status != status::success) return status; - - init_balancers(); - - auto scratchpad = scratchpad_registry().registrar(); - jit_sve_512_1x1_conv_kernel::init_scratchpad(scratchpad, jcp_); - - auto reducer_bia_scratchpad = memory_tracking::registrar_t( - scratchpad, memory_tracking::names::prefix_reducer_bia); - reducer_bia_conf_.init_scratchpad(reducer_bia_scratchpad); - rtus_prepare_space_info(this, scratchpad, jcp_.nthr); - - return status::success; - } - - // TODO (Roma): structs conf header cleanup - jit_1x1_conv_conf_t jcp_; - cpu_reducer_t::conf_t reducer_bia_conf_; - reduce_to_unit_stride_t rtus_; - - protected: - bool set_default_formats() { - using namespace format_tag; - - const memory_desc_wrapper src_d(&src_md_); - const memory_desc_wrapper diff_dst_d(&diff_dst_md_); - - const auto dat_tag_nxc = utils::pick(ndims() - 3, nwc, nhwc, ndhwc); - const auto dat_tag_nCx16c - = utils::pick(ndims() - 3, nCw16c, nChw16c, nCdhw16c); - const auto curr_src_tag - = src_d.matches_one_of_tag(dat_tag_nxc, dat_tag_nCx16c); - const auto curr_dst_tag = diff_dst_d.matches_one_of_tag( - dat_tag_nxc, 
dat_tag_nCx16c); - const auto is_data_layout_nxc - = IMPLICATION(curr_src_tag != dat_tag_nxc, - src_d.format_kind() == format_kind::any) - && IMPLICATION(curr_dst_tag != dat_tag_nxc, - diff_dst_d.format_kind() == format_kind::any) - && utils::one_of(dat_tag_nxc, curr_src_tag, curr_dst_tag); - - auto dat_tag = is_data_layout_nxc ? dat_tag_nxc : dat_tag_nCx16c; - auto wei_tag = utils::pick(2 * ndims() - 6 + with_groups(), - OIw16i16o, gOIw16i16o, OIhw16i16o, gOIhw16i16o, OIdhw16i16o, - gOIdhw16i16o); - - return set_default_formats_common(dat_tag, wei_tag, dat_tag); - } - - private: - void init_balancers() { - const size_t max_buffer_size = jcp_.nthr * 3 * 5 * 5 * 16 * 16; - if (with_bias()) { - reducer_bia_conf_.init(reduce_balancer_t(jcp_.nthr, - jcp_.oc_block, jcp_.ngroups * jcp_.nb_load, jcp_.mb, - max_buffer_size, true)); - } - } - }; - - template - friend status_t init_rtus_driver(conv_t *self); - - jit_sve_512_1x1_convolution_bwd_weights_t(const pd_t *apd) - : primitive_t(apd) {} - - typedef typename prec_traits::type data_t; - - status_t init(engine_t *engine) override; - - status_t execute(const exec_ctx_t &ctx) const override { - execute_backward_weights(ctx); - return status::success; - } - -private: - void execute_backward_weights(const exec_ctx_t &ctx) const; - const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); } - - std::unique_ptr kernel_; - std::unique_ptr> acc_ker_; - std::unique_ptr> reducer_bias_; - // std::unique_ptr trans_kernel_; - std::unique_ptr> rtus_driver_; -}; - -} // namespace aarch64 -} // namespace cpu -} // namespace impl -} // namespace dnnl - -#endif diff --git a/src/cpu/aarch64/jit_sve_512_core_x8s8s32x_deconvolution.cpp b/src/cpu/aarch64/jit_sve_512_core_x8s8s32x_deconvolution.cpp index 593a933f104..0694fc72d6e 100644 --- a/src/cpu/aarch64/jit_sve_512_core_x8s8s32x_deconvolution.cpp +++ b/src/cpu/aarch64/jit_sve_512_core_x8s8s32x_deconvolution.cpp @@ -114,10 +114,11 @@ status_t _jit_sve_512_core_x8s8s32x_deconv_fwd_kernel::init_conf( if (jcp.is_depthwise && (!jcp.signed_input || is_3d)) return status::unimplemented; - if (!zero_points_valid(&attr)) return status::unimplemented; + VDISPATCH_DECONVOLUTION_IC( + zero_points_valid(&attr), VERBOSE_UNSUPPORTED_ZP_CFG); jcp.src_zero_point = !attr.zero_points_.has_default_values(DNNL_ARG_SRC); jcp.dst_zero_point = !attr.zero_points_.has_default_values(DNNL_ARG_DST); - jcp.zp_src_is_common = attr.zero_points_.common(DNNL_ARG_SRC); + jcp.zp_src_is_common = attr.zero_points_.get_mask(DNNL_ARG_SRC) == 0; format_tag_t dat_tag = utils::pick( ndims - 3, format_tag::nwc, format_tag::nhwc, format_tag::ndhwc); @@ -274,12 +275,8 @@ status_t _jit_sve_512_core_x8s8s32x_deconv_fwd_kernel::init_conf( //save post_ops desc for further usage jcp.post_ops = p; - const auto &oscales = attr.output_scales_; - jcp.is_oc_scale = oscales.mask_ == 1 << 1; - - // only common and per-oc-channel scales are supported - const bool oscales_ok = one_of(oscales.mask_, 0, 1 << 1); - if (!oscales_ok) return status::unimplemented; + // TODO: add proper scaling support. + jcp.is_oc_scale = false; jcp.dst_dt = dst_d.data_type(); jcp.bia_dt = jcp.with_bias ? bias_d.data_type() : data_type::undef; @@ -1416,7 +1413,8 @@ status_t jit_sve_512_core_x8s8s32x_deconvolution_fwd_t::execute_forward_1d( const int oc_chunks = jcp.nb_oc / jcp.nb_oc_blocking; const int nb_groups = jcp.nb_ch; - DEFINE_SCALES_BUFFER(oscales); + // TODO: add support for scaling based on latest programming model. 
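The "latest programming model" these TODOs refer to is the v3.x per-argument quantization API: scales and zero points are attached to individual arguments with a mask and supplied at execution time. A hedged user-side sketch of that configuration (assuming the public oneDNN v3.x C++ API; the helper name is illustrative):

#include "oneapi/dnnl/dnnl.hpp"

dnnl::primitive_attr make_int8_attr() {
    dnnl::primitive_attr attr;
    // mask = 0: a single scale for the whole weights tensor;
    // mask = 1 << 1 would request one scale per output channel, the
    // per-oc case the removed oscales code used to special-case.
    attr.set_scales_mask(DNNL_ARG_WEIGHTS, /* mask = */ 0);
    // mask = 0 for zero points is the "common" case that
    // jcp.zp_src_is_common detects above via get_mask() == 0.
    attr.set_zero_points_mask(DNNL_ARG_SRC, /* mask = */ 0);
    return attr;
}

At execution the actual values travel as separate memories under DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS and DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_SRC, which is what the DEFINE_ARG_SCALES_BUFFER line below reads back on the library side.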
+ DEFINE_ARG_SCALES_BUFFER(oscales, DNNL_ARG_WEIGHTS); const size_t offset = weights_d.size() - weights_d.additional_buffer_size(); auto w = const_cast(weights); int32_t *compensation = (!jcp.signed_input) @@ -1514,7 +1512,8 @@ status_t jit_sve_512_core_x8s8s32x_deconvolution_fwd_t::execute_forward_2d( size_t dst_h_stride = dst_d.blk_off(0, 0, 1); size_t wht_kh_stride = wht_blk_off(weights_d, 0, 0, 0, 1); - DEFINE_SCALES_BUFFER(oscales); + // TODO: add support for scaling based on latest programming model. + DEFINE_ARG_SCALES_BUFFER(oscales, DNNL_ARG_WEIGHTS); const size_t offset = weights_d.size() - weights_d.additional_buffer_size(); auto w = const_cast(weights); int32_t *compensation = (!jcp.signed_input) @@ -1675,7 +1674,8 @@ status_t jit_sve_512_core_x8s8s32x_deconvolution_fwd_t::execute_forward_3d( size_t wht_kd_stride = wht_blk_off(weights_d, 0, 0, 0, 1); size_t wht_kh_stride = wht_blk_off(weights_d, 0, 0, 0, 0, 1); - DEFINE_SCALES_BUFFER(oscales); + // TODO: add support for scaling based on latest programming model. + DEFINE_ARG_SCALES_BUFFER(oscales, DNNL_ARG_WEIGHTS); size_t offset = weights_d.size() - weights_d.additional_buffer_size(); auto w = const_cast(weights); int32_t *compensation = (!jcp.signed_input) diff --git a/src/cpu/aarch64/jit_sve_512_core_x8s8s32x_deconvolution.hpp b/src/cpu/aarch64/jit_sve_512_core_x8s8s32x_deconvolution.hpp index 2429ac43df5..db1188ea620 100644 --- a/src/cpu/aarch64/jit_sve_512_core_x8s8s32x_deconvolution.hpp +++ b/src/cpu/aarch64/jit_sve_512_core_x8s8s32x_deconvolution.hpp @@ -286,9 +286,8 @@ struct jit_sve_512_core_x8s8s32x_deconvolution_fwd_t : public primitive_t { weights_md(1)->data_type, f32, s32, s8, u8)) && utils::one_of(dst_md(0)->data_type, f32, s32, s8, u8) && desc()->accum_data_type == s32 - && attr()->has_default_values(skip_mask_t::oscale_runtime - | skip_mask_t::post_ops - | skip_mask_t::zero_points_runtime); + && attr()->has_default_values( + skip_mask_t::post_ops | skip_mask_t::zero_points); if (!ok) return status::unimplemented; CHECK(_jit_sve_512_core_x8s8s32x_deconv_fwd_kernel::init_conf(jcp_, @@ -302,7 +301,7 @@ struct jit_sve_512_core_x8s8s32x_deconvolution_fwd_t : public primitive_t { return status::success; } - jit_conv_conf_t jcp_; + jit_conv_conf_t jcp_ = utils::zero(); }; jit_sve_512_core_x8s8s32x_deconvolution_fwd_t(const pd_t *apd) diff --git a/src/cpu/aarch64/jit_sve_512_x8s8s32x_conv_kernel.cpp b/src/cpu/aarch64/jit_sve_512_x8s8s32x_conv_kernel.cpp index c8a2314ae06..bb2442f694e 100644 --- a/src/cpu/aarch64/jit_sve_512_x8s8s32x_conv_kernel.cpp +++ b/src/cpu/aarch64/jit_sve_512_x8s8s32x_conv_kernel.cpp @@ -1426,12 +1426,8 @@ status_t jit_sve_512_x8s8s32x_fwd_kernel::init_conf(jit_conv_conf_t &jcp, pick_loop_order(jcp, jcp.nthr); - const auto &oscales = attr.output_scales_; - jcp.is_oc_scale = oscales.mask_ == 1 << 1; - - // only common and per-oc-channel scales are supported - const bool oscales_ok = one_of(oscales.mask_, 0, 1 << 1); - if (!oscales_ok) return status::unimplemented; + // TODO: enable quantization. 
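Elsewhere in these hunks the pd_t structs also start zero-initializing their config members (`jit_conv_conf_t jcp_ = utils::zero();` above) instead of relying on an empty jcp_() member-initializer. A hedged illustration of the idiom, assuming a value-clearing helper along these lines (an illustrative stand-in, not the library's utils::zero):

#include <cstring>

// Illustrative: return a T with every byte cleared, so a plain-old-data
// config such as jit_conv_conf_t starts from a known state even when a
// field is added later and some init_conf path forgets to set it.
template <typename T>
inline T zero() {
    T t;
    std::memset(&t, 0, sizeof(T));
    return t;
}

// Usage mirroring the patch:
//   jit_conv_conf_t jcp_ = zero<jit_conv_conf_t>();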
+ jcp.is_oc_scale = false; jcp.wei_adj_scale = (weights_d.extra().flags & memory_extra_flags::scale_adjust) diff --git a/src/cpu/aarch64/jit_sve_512_x8s8s32x_convolution.cpp b/src/cpu/aarch64/jit_sve_512_x8s8s32x_convolution.cpp index e1c0d5ee562..e7795140983 100644 --- a/src/cpu/aarch64/jit_sve_512_x8s8s32x_convolution.cpp +++ b/src/cpu/aarch64/jit_sve_512_x8s8s32x_convolution.cpp @@ -61,7 +61,8 @@ jit_sve_512_x8s8s32x_convolution_fwd_t::execute_forward_1d( assert(jcp.nb_oc % jcp.nb_oc_blocking == 0); assert(jcp.nb_ch % jcp.nb_ch_blocking == 0); - DEFINE_SCALES_BUFFER(oscales); + // TODO: add support for scaling based on latest programming model. + DEFINE_ARG_SCALES_BUFFER(oscales, DNNL_ARG_WEIGHTS); size_t offset = weights_d.size() - weights_d.additional_buffer_size(); auto w = const_cast(weights); @@ -174,7 +175,8 @@ jit_sve_512_x8s8s32x_convolution_fwd_t::execute_forward_2d( assert(jcp.nb_oc % jcp.nb_oc_blocking == 0); assert(jcp.nb_ch % jcp.nb_ch_blocking == 0); - DEFINE_SCALES_BUFFER(oscales); + // TODO: add support for scaling based on latest programming model. + DEFINE_ARG_SCALES_BUFFER(oscales, DNNL_ARG_WEIGHTS); size_t offset = weights_d.size() - weights_d.additional_buffer_size(); auto w = const_cast(weights); @@ -319,7 +321,8 @@ status_t jit_sve_512_x8s8s32x_convolution_fwd_t(weights); @@ -408,7 +411,8 @@ jit_sve_512_x8s8s32x_convolution_fwd_t::execute_forward_3d( assert(jcp.nb_oc % jcp.nb_oc_blocking == 0); assert(jcp.nb_ch % jcp.nb_ch_blocking == 0); - DEFINE_SCALES_BUFFER(oscales); + // TODO: add support for scaling based on latest programming model. + DEFINE_ARG_SCALES_BUFFER(oscales, DNNL_ARG_WEIGHTS); size_t offset = weights_d.size() - weights_d.additional_buffer_size(); auto w = const_cast(weights); diff --git a/src/cpu/aarch64/jit_sve_512_x8s8s32x_convolution.hpp b/src/cpu/aarch64/jit_sve_512_x8s8s32x_convolution.hpp index e6c73302079..60b66651359 100644 --- a/src/cpu/aarch64/jit_sve_512_x8s8s32x_convolution.hpp +++ b/src/cpu/aarch64/jit_sve_512_x8s8s32x_convolution.hpp @@ -36,9 +36,7 @@ namespace aarch64 { template struct jit_sve_512_x8s8s32x_convolution_fwd_t : public primitive_t { struct pd_t : public cpu_convolution_fwd_pd_t { - pd_t(const convolution_desc_t *adesc, const primitive_attr_t *attr, - const typename pd_t::base_class *hint_fwd_pd) - : cpu_convolution_fwd_pd_t(adesc, attr, hint_fwd_pd), jcp_() {} + using cpu_convolution_fwd_pd_t::cpu_convolution_fwd_pd_t; DECLARE_COMMON_PD_T(JIT_IMPL_NAME_HELPER("jit_int8:", sve_512, ""), jit_sve_512_x8s8s32x_convolution_fwd_t); @@ -53,9 +51,7 @@ struct jit_sve_512_x8s8s32x_convolution_fwd_t : public primitive_t { utils::one_of(bias_md_.data_type, data_type::f32, data_type::s32, data_type::s8, data_type::u8)) - && attr()->has_default_values( - smask_t::oscale_runtime | smask_t::post_ops, - dst_type) + && attr()->has_default_values(smask_t::post_ops, dst_type) && !has_zero_dim_memory(); if (!ok) return status::unimplemented; @@ -71,15 +67,15 @@ struct jit_sve_512_x8s8s32x_convolution_fwd_t : public primitive_t { return status; } - jit_conv_conf_t jcp_; + jit_conv_conf_t jcp_ = utils::zero(); }; jit_sve_512_x8s8s32x_convolution_fwd_t(const pd_t *apd) : primitive_t(apd) {} - typedef typename prec_traits::type src_data_t; - typedef typename prec_traits::type wei_data_t; - typedef typename prec_traits::type dst_data_t; + typedef typename prec_traits_t::type src_data_t; + typedef typename prec_traits_t::type wei_data_t; + typedef typename prec_traits_t::type dst_data_t; status_t init(engine_t *engine) override { 
CHECK(safe_ptr_assign(kernel_, diff --git a/src/cpu/aarch64/jit_sve_conv_kernel.cpp b/src/cpu/aarch64/jit_sve_conv_kernel.cpp index 3acec02fe29..26cb17300af 100644 --- a/src/cpu/aarch64/jit_sve_conv_kernel.cpp +++ b/src/cpu/aarch64/jit_sve_conv_kernel.cpp @@ -1250,8 +1250,7 @@ void jit_sve_conv_bwd_data_kernel_f32::store_output(int ur_w) { auto out_load = [=](int aux_output_offset, int idx, int prev_ofs) { int ofs = aux_output_offset; - if ((VL_OFS(ofs, isa) < LDRMAX) && (VL_OFS(ofs, isa) >= (-1 * LDRMAX)) - && ((ofs & 0x3f) == 0)) { + if (ldr_imm_check(ofs) && (ofs % 64 == 0)) { add_imm(X_DEFAULT_ADDR, reg_src, ofs, X_TMP_0); ld1w(zreg_tmp(idx).s, P_ALL_ONE / T_z, ptr(X_DEFAULT_ADDR)); } else { @@ -1273,8 +1272,7 @@ void jit_sve_conv_bwd_data_kernel_f32::store_output(int ur_w) { auto out_str = [=](int j, int k, int aux_output_offset, int prev_ofs) { int ofs = aux_output_offset; - if ((VL_OFS(ofs, isa) < LDRMAX) && (VL_OFS(ofs, isa) >= (-1 * LDRMAX)) - && ((ofs & 0x3f) == 0)) { + if (ldr_imm_check(ofs) && (ofs % 64 == 0)) { add_imm(X_DEFAULT_ADDR, reg_src, ofs, X_TMP_0); st1w(zreg_out(j, k).s, P_ALL_ONE / T_z, ptr(X_DEFAULT_ADDR)); @@ -1415,8 +1413,7 @@ void jit_sve_conv_bwd_data_kernel_f32::compute_loop_fma( auto ker_load = [=](int i, int aux_kernel_offset) { int ofs = aux_kernel_offset; - if ((VL_OFS(ofs, isa) < LDRMAX) && (VL_OFS(ofs, isa) >= (-1 * LDRMAX)) - && ((ofs & 0x3f) == 0)) { + if (ldr_imm_check(ofs) && (ofs % 64 == 0)) { add_imm(X_DEFAULT_ADDR, aux_reg_ker, ofs, X_TMP_0); ld1w(zreg_ker(i).s, P_ALL_ONE / T_z, ptr(X_DEFAULT_ADDR)); @@ -4467,4 +4464,4 @@ template struct jit_sve_conv_bwd_weights_kernel_f32; } // namespace aarch64 } // namespace cpu } // namespace impl -} // namespace dnnl \ No newline at end of file +} // namespace dnnl diff --git a/src/cpu/aarch64/jit_sve_convolution.hpp b/src/cpu/aarch64/jit_sve_convolution.hpp index 16b397406d7..628e94c87ed 100644 --- a/src/cpu/aarch64/jit_sve_convolution.hpp +++ b/src/cpu/aarch64/jit_sve_convolution.hpp @@ -39,9 +39,7 @@ template struct jit_sve_convolution_fwd_t : public primitive_t { struct pd_t : public cpu_convolution_fwd_pd_t { - pd_t(const convolution_desc_t *adesc, const primitive_attr_t *attr, - const typename pd_t::base_class *hint_fwd_pd) - : cpu_convolution_fwd_pd_t(adesc, attr, hint_fwd_pd), jcp_() {} + using cpu_convolution_fwd_pd_t::cpu_convolution_fwd_pd_t; DECLARE_COMMON_PD_T(JIT_IMPL_NAME_HELPER("jit:", isa, ""), jit_sve_convolution_fwd_t); @@ -67,14 +65,14 @@ struct jit_sve_convolution_fwd_t : public primitive_t { return status; } - jit_conv_conf_t jcp_; + jit_conv_conf_t jcp_ = utils::zero(); }; jit_sve_convolution_fwd_t(const pd_t *apd) : primitive_t(apd) {} - typedef typename prec_traits::type src_data_t; - typedef typename prec_traits::type wei_data_t; - typedef typename prec_traits::type dst_data_t; + typedef typename prec_traits_t::type src_data_t; + typedef typename prec_traits_t::type wei_data_t; + typedef typename prec_traits_t::type dst_data_t; status_t init(engine_t *engine) override { CHECK(safe_ptr_assign(kernel_, @@ -114,9 +112,7 @@ template struct jit_sve_convolution_bwd_data_t : public primitive_t { struct pd_t : public cpu_convolution_bwd_data_pd_t { - pd_t(const convolution_desc_t *adesc, const primitive_attr_t *attr, - const convolution_fwd_pd_t *hint_fwd_pd) - : cpu_convolution_bwd_data_pd_t(adesc, attr, hint_fwd_pd), jcp_() {} + using cpu_convolution_bwd_data_pd_t::cpu_convolution_bwd_data_pd_t; DECLARE_COMMON_PD_T(JIT_IMPL_NAME_HELPER("jit:", isa, ""), jit_sve_convolution_bwd_data_t); @@ 
-141,14 +137,14 @@ struct jit_sve_convolution_bwd_data_t : public primitive_t { return status::success; } - jit_conv_conf_t jcp_; + jit_conv_conf_t jcp_ = utils::zero(); }; jit_sve_convolution_bwd_data_t(const pd_t *apd) : primitive_t(apd) {} - typedef typename prec_traits::type diff_dst_data_t; - typedef typename prec_traits::type wei_data_t; - typedef typename prec_traits::type diff_src_data_t; + typedef typename prec_traits_t::type diff_dst_data_t; + typedef typename prec_traits_t::type wei_data_t; + typedef typename prec_traits_t::type diff_src_data_t; status_t init(engine_t *engine) override { CHECK(safe_ptr_assign(kernel_, @@ -183,10 +179,8 @@ template struct jit_sve_convolution_bwd_weights_t : public primitive_t { struct pd_t : public cpu_convolution_bwd_weights_pd_t { - pd_t(const convolution_desc_t *adesc, const primitive_attr_t *attr, - const convolution_fwd_pd_t *hint_fwd_pd) - : cpu_convolution_bwd_weights_pd_t(adesc, attr, hint_fwd_pd) - , jcp_() {} + using cpu_convolution_bwd_weights_pd_t:: + cpu_convolution_bwd_weights_pd_t; DECLARE_COMMON_PD_T(JIT_IMPL_NAME_HELPER("jit:", isa, ""), jit_sve_convolution_bwd_weights_t); @@ -218,7 +212,7 @@ struct jit_sve_convolution_bwd_weights_t : public primitive_t { return status; } - jit_conv_conf_t jcp_; + jit_conv_conf_t jcp_ = utils::zero(); typename cpu_reducer_t::conf_t reducer_bia_conf_; @@ -235,9 +229,9 @@ struct jit_sve_convolution_bwd_weights_t : public primitive_t { jit_sve_convolution_bwd_weights_t(const pd_t *apd) : primitive_t(apd) {} - typedef typename prec_traits::type src_data_t; - typedef typename prec_traits::type diff_dst_data_t; - typedef typename prec_traits::type diff_weights_data_t; + typedef typename prec_traits_t::type src_data_t; + typedef typename prec_traits_t::type diff_dst_data_t; + typedef typename prec_traits_t::type diff_weights_data_t; status_t init(engine_t *engine) override; @@ -274,4 +268,4 @@ struct jit_sve_convolution_bwd_weights_t : public primitive_t { #endif -// vim: et ts=4 sw=4 cindent cino+=l0,\:4,N-s \ No newline at end of file +// vim: et ts=4 sw=4 cindent cino+=l0,\:4,N-s diff --git a/src/cpu/aarch64/jit_uni_1x1_conv_utils.hpp b/src/cpu/aarch64/jit_uni_1x1_conv_utils.hpp index 521a8c54dc7..8b52047dc14 100644 --- a/src/cpu/aarch64/jit_uni_1x1_conv_utils.hpp +++ b/src/cpu/aarch64/jit_uni_1x1_conv_utils.hpp @@ -1,6 +1,6 @@ /******************************************************************************* * Copyright 2021-2023 Intel Corporation -* Copyright 2021-2023 FUJITSU LIMITED +* Copyright 2021-2024 FUJITSU LIMITED * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
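The mechanical prec_traits to prec_traits_t renames running through the hunks above follow the library's trailing-_t convention for type names. The trait itself maps a compile-time data_type tag to its C++ storage type; a minimal sketch of the idea, with a made-up enum rather than the real dnnl::impl::data_type namespace:

#include <cstdint>

enum class data_type { f32, s32, s8, u8 };

template <data_type dt> struct prec_traits_t;
template <> struct prec_traits_t<data_type::f32> { using type = float; };
template <> struct prec_traits_t<data_type::s32> { using type = int32_t; };
template <> struct prec_traits_t<data_type::s8> { using type = int8_t; };
template <> struct prec_traits_t<data_type::u8> { using type = uint8_t; };

// e.g. a kernel templated on a data_type picks its element type as
//   typedef typename prec_traits_t<src_type>::type src_data_t;
// which is the pattern the typedefs in these hunks express.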
@@ -42,7 +42,7 @@ struct reduce_to_unit_stride_t { /* 1x1-kernel does not support non-unit strides so far, so the idea is: * - for fwd or bwd_weights: to copy src to a scratch memory (with strides - * equal to 1) and then call the kernel + * equal to 1) and then call the kernel * - for bwd_data: reduce the problem to the one with unit stride by * performing computations in a scratch memory (with strides equal to 1) * and then copy the result to diff_src */ @@ -50,7 +50,6 @@ template inline void rtus_prepare(conv_pd_t *self, const convolution_desc_t *&conv_d, const memory_desc_t *&src_d, const memory_desc_t *dst_d) { const int ndims = src_d->ndims; - bool rtus_applicable = utils::one_of(ndims, 3, 4); if (ndims == 3) rtus_applicable = rtus_applicable && conv_d->strides[0] != 1 @@ -182,13 +181,15 @@ struct rtus_driver_t : public jit_generator { ZReg res = ZReg(idx); if (is_nspc_) { switch (isa) { - case sve_512: res = ZReg(idx); break; + case sve_512: + case sve_256: res = ZReg(idx); break; default: assert(!"Not supported isa"); res = ZReg(idx); } return res; } switch (isa) { case sve_512: + case sve_256: switch (typesize) { case 4: res = ZReg(idx); break; default: @@ -202,7 +203,7 @@ struct rtus_driver_t : public jit_generator { reg_zero = Vmm(0, typesize); reg_v = Vmm(1, typesize); - vlen_ = reg_v.getBit() / 8; + vlen_ = cpu_isa_traits::vlen; vlen_shift_ = 0; int tvlen = is_nspc_ ? typesize_ : vlen_; @@ -217,7 +218,6 @@ struct rtus_driver_t : public jit_generator { void loop_is() { using namespace Xbyak_aarch64; - mov(reg_cur_src, reg_src); mov(reg_cur_iw, reg_iw_start); mov(reg_cur_os, reg_os); @@ -285,7 +285,7 @@ struct rtus_driver_t : public jit_generator { mov(reg_cur_src, reg_src); mov(reg_cur_iw, reg_iw_start); - if (isa == sve_512) { + if (isa == sve_256 || isa == sve_512) { and_(reg_icb_remainder, reg_icb, (vlen_ / typesize_) - 1); mov_imm(X_TMP_0, 0); whilelt(tail_mask.s, X_TMP_0, reg_icb_remainder); @@ -356,8 +356,9 @@ struct rtus_driver_t : public jit_generator { const size_t w_step_factor = ic_ * typesize_; const size_t max_load_store_bytes = typesize_ == 4 ? 32 : 16; - const size_t load_store_size - = isa == sve_512 ? vlen_ : max_load_store_bytes; + const size_t load_store_size = (isa == sve_256 || isa == sve_512) + ? 
vlen_ + : max_load_store_bytes; Label is_loop, ic_loop, ic_loop_tail, ic_loop_finish; L(is_loop); @@ -467,7 +468,7 @@ struct rtus_driver_t : public jit_generator { void generate() override { using namespace Xbyak_aarch64; - assert(isa == sve_512); + assert(isa == sve_256 || isa == sve_512); preamble(); #define READ_PARAM(what) \ diff --git a/src/cpu/aarch64/jit_uni_batch_normalization.hpp b/src/cpu/aarch64/jit_uni_batch_normalization.hpp index 3311f5b665a..7197ce2d815 100644 --- a/src/cpu/aarch64/jit_uni_batch_normalization.hpp +++ b/src/cpu/aarch64/jit_uni_batch_normalization.hpp @@ -42,10 +42,8 @@ struct driver_t; template struct jit_uni_batch_normalization_fwd_t : public primitive_t { struct pd_t : public cpu_batch_normalization_fwd_pd_t { - pd_t(const batch_normalization_desc_t *adesc, - const primitive_attr_t *attr, - const batch_normalization_fwd_pd_t *hint_fwd_pd) - : cpu_batch_normalization_fwd_pd_t(adesc, attr, hint_fwd_pd) {} + using cpu_batch_normalization_fwd_pd_t:: + cpu_batch_normalization_fwd_pd_t; DECLARE_COMMON_PD_T(JIT_IMPL_NAME_HELPER("bnorm_jit:", isa, ""), jit_uni_batch_normalization_fwd_t); @@ -70,10 +68,8 @@ struct jit_uni_batch_normalization_fwd_t : public primitive_t { template struct jit_uni_batch_normalization_bwd_t : public primitive_t { struct pd_t : public cpu_batch_normalization_bwd_pd_t { - pd_t(const batch_normalization_desc_t *adesc, - const primitive_attr_t *attr, - const batch_normalization_fwd_pd_t *hint_fwd_pd) - : cpu_batch_normalization_bwd_pd_t(adesc, attr, hint_fwd_pd) {} + using cpu_batch_normalization_bwd_pd_t:: + cpu_batch_normalization_bwd_pd_t; DECLARE_COMMON_PD_T(JIT_IMPL_NAME_HELPER("bnorm_jit:", isa, ""), jit_uni_batch_normalization_bwd_t); diff --git a/src/cpu/aarch64/jit_uni_batch_normalization_s8.hpp b/src/cpu/aarch64/jit_uni_batch_normalization_s8.hpp index 3b96d7ed5ed..3950ffcccfd 100644 --- a/src/cpu/aarch64/jit_uni_batch_normalization_s8.hpp +++ b/src/cpu/aarch64/jit_uni_batch_normalization_s8.hpp @@ -41,10 +41,8 @@ struct driver_t; template struct jit_uni_batch_normalization_s8_fwd_t : public primitive_t { struct pd_t : public cpu_batch_normalization_fwd_pd_t { - pd_t(const batch_normalization_desc_t *adesc, - const primitive_attr_t *attr, - const batch_normalization_fwd_pd_t *hint_fwd_pd) - : cpu_batch_normalization_fwd_pd_t(adesc, attr, hint_fwd_pd) {} + using cpu_batch_normalization_fwd_pd_t:: + cpu_batch_normalization_fwd_pd_t; DECLARE_COMMON_PD_T(JIT_IMPL_NAME_HELPER("bnorm_s8_jit:", isa, ""), jit_uni_batch_normalization_s8_fwd_t); diff --git a/src/cpu/aarch64/jit_uni_binary.cpp b/src/cpu/aarch64/jit_uni_binary.cpp index 2dd83c3b1a6..bfaf2b64b45 100644 --- a/src/cpu/aarch64/jit_uni_binary.cpp +++ b/src/cpu/aarch64/jit_uni_binary.cpp @@ -118,7 +118,7 @@ status_t jit_uni_binary_t::pd_t::init(engine_t *engine) { && data_format_supported(src0_md_, conf_.isa) && set_default_params() == status::success && !has_zero_dim_memory() && IMPLICATION(!conf_.is_i8, src0_md_ == dst_md_) && is_applicable() - && attr()->has_default_values(sm::post_ops | sm::scales_runtime) + && attr()->has_default_values(sm::post_ops | sm::scales) && attr_.set_default_formats(dst_md(0)) == status::success; if (!ok) return status::unimplemented; @@ -140,10 +140,8 @@ status_t jit_uni_binary_t::pd_t::init(engine_t *engine) { po, src0_md_, get_supported_postops_bcast_strategies()); conf_.op_type = get_op_type(src0_md_); assert(conf_.op_type != op_t::none); - conf_.do_scale_src0 = !attr()->scales_.get(DNNL_ARG_SRC_0).defined() - || 
!attr()->scales_.get(DNNL_ARG_SRC_0).has_default_values(); - conf_.do_scale_src1 = !attr()->scales_.get(DNNL_ARG_SRC_1).defined() - || !attr()->scales_.get(DNNL_ARG_SRC_1).has_default_values(); + conf_.do_scale_src0 = !attr()->scales_.has_default_values(DNNL_ARG_SRC_0); + conf_.do_scale_src1 = !attr()->scales_.has_default_values(DNNL_ARG_SRC_1); const auto sum_idx = po.find(primitive_kind::sum); conf_.do_sum = sum_idx != -1 && po.entry_[sum_idx].sum.scale != 0.f; conf_.with_eltwise = po.find(primitive_kind::eltwise) != -1; diff --git a/src/cpu/aarch64/jit_uni_dw_conv_kernel_f32.cpp b/src/cpu/aarch64/jit_uni_dw_conv_kernel_f32.cpp index 557d19166d4..f146c82c728 100644 --- a/src/cpu/aarch64/jit_uni_dw_conv_kernel_f32.cpp +++ b/src/cpu/aarch64/jit_uni_dw_conv_kernel_f32.cpp @@ -1,6 +1,7 @@ /******************************************************************************* * Copyright 2021-2022 Intel Corporation * Copyright 2021-2024 FUJITSU LIMITED +* Copyright 2024 Arm Ltd. and affiliates * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -1099,9 +1100,14 @@ void jit_uni_dw_conv_bwd_weights_kernel_f32::generate() { const int simd_w_ = cpu_isa_traits::vlen / sizeof(float); preamble(); //TO DO : renaming predicate register (P_ALL_ONE) - if (simd_w_ != cpu_sveLen / sizeof(float)) + if (simd_w_ != cpu_sveLen / sizeof(float)) { set_preg(P_ALL_ONE.s, simd_w_, X_TMP_0, X_TMP_1); - if (simd_w_ != 16 || simd_w_ != 8) assert(!"Unsupport: simd_w != 16, 8"); + } + + if (simd_w_ != 16 && simd_w_ != 8) { + assert(!"Unsupported: simd_w != 16, 8"); + } + ldr(reg_input_baddr, ptr(abi_param1, static_cast(offsetof(jit_dw_conv_call_s, input)))); diff --git a/src/cpu/aarch64/jit_uni_dw_convolution.hpp b/src/cpu/aarch64/jit_uni_dw_convolution.hpp index 74adbc4c7b0..de3739681e5 100644 --- a/src/cpu/aarch64/jit_uni_dw_convolution.hpp +++ b/src/cpu/aarch64/jit_uni_dw_convolution.hpp @@ -36,7 +36,9 @@ namespace aarch64 { template struct jit_uni_dw_convolution_fwd_t : public primitive_t { struct pd_t : public cpu_convolution_fwd_pd_t { - pd_t(const convolution_desc_t *adesc, const primitive_attr_t *attr, + // Note: check `USING_INHERITED_IS_IMPOSSIBLE` comment in other files + // for details why this ctor can't be removed. 
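Most pd_t classes in this patch replace their hand-written constructor with an inheriting using-declaration; the one just below cannot, because its parameter type is widened relative to the base's. A hedged toy illustration of both cases (illustrative types, not the real pd hierarchy):

struct base_pd_t {
    base_pd_t(const int *desc, const float *attr) {}
};

// The common cleanup: inherit every base constructor verbatim.
struct plain_pd_t : base_pd_t {
    using base_pd_t::base_pd_t;
};

// The exception: the derived ctor deliberately accepts a more general
// descriptor type, which a using-declaration cannot express, so an
// explicit forwarding constructor has to stay.
struct general_pd_t : base_pd_t {
    general_pd_t(const void *desc, const float *attr)
        : base_pd_t(static_cast<const int *>(desc), attr) {}
};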
+ pd_t(const op_desc_t *adesc, const primitive_attr_t *attr, const typename pd_t::base_class *hint_fwd_pd) : cpu_convolution_fwd_pd_t(adesc, attr, hint_fwd_pd), jcp_() {} @@ -69,15 +71,15 @@ struct jit_uni_dw_convolution_fwd_t : public primitive_t { return status::success; } - jit_conv_conf_t jcp_; + jit_conv_conf_t jcp_ = utils::zero(); }; jit_uni_dw_convolution_fwd_t(const pd_t *apd) : primitive_t(apd) {} - typedef typename prec_traits::type f32_data_t; - typedef typename prec_traits::type bf16_data_t; - typedef typename prec_traits::type data_t; - typedef typename prec_traits::type dst_data_t; + typedef typename prec_traits_t::type f32_data_t; + typedef typename prec_traits_t::type bf16_data_t; + typedef typename prec_traits_t::type data_t; + typedef typename prec_traits_t::type dst_data_t; status_t init(engine_t *engine) override { CHECK(safe_ptr_assign(kernel_, @@ -105,9 +107,7 @@ template struct jit_uni_dw_convolution_bwd_data_t : public primitive_t { struct pd_t : public cpu_convolution_bwd_data_pd_t { - pd_t(const convolution_desc_t *adesc, const primitive_attr_t *attr, - const convolution_fwd_pd_t *hint_fwd_pd) - : cpu_convolution_bwd_data_pd_t(adesc, attr, hint_fwd_pd), jcp_() {} + using cpu_convolution_bwd_data_pd_t::cpu_convolution_bwd_data_pd_t; DECLARE_COMMON_PD_T(JIT_IMPL_NAME_HELPER("jit_dw:", jcp_.isa, ""), jit_uni_dw_convolution_bwd_data_t); @@ -134,7 +134,7 @@ struct jit_uni_dw_convolution_bwd_data_t : public primitive_t { return status::success; } - jit_conv_conf_t jcp_; + jit_conv_conf_t jcp_ = utils::zero(); protected: bool set_default_formats() { @@ -158,9 +158,9 @@ struct jit_uni_dw_convolution_bwd_data_t : public primitive_t { jit_uni_dw_convolution_bwd_data_t(const pd_t *apd) : primitive_t(apd) {} - typedef typename prec_traits::type diff_src_data_t; - typedef typename prec_traits::type diff_dst_data_t; - typedef typename prec_traits::type wei_data_t; + typedef typename prec_traits_t::type diff_src_data_t; + typedef typename prec_traits_t::type diff_dst_data_t; + typedef typename prec_traits_t::type wei_data_t; status_t init(engine_t *engine) override { CHECK(safe_ptr_assign(kernel_, @@ -191,10 +191,9 @@ template struct jit_uni_dw_convolution_bwd_weights_t : public primitive_t { struct pd_t : public cpu_convolution_bwd_weights_pd_t { - pd_t(const convolution_desc_t *adesc, const primitive_attr_t *attr, - const convolution_fwd_pd_t *hint_fwd_pd) - : cpu_convolution_bwd_weights_pd_t(adesc, attr, hint_fwd_pd) - , jcp_() {} + using cpu_convolution_bwd_weights_pd_t:: + cpu_convolution_bwd_weights_pd_t; + using jit_uni_dw_convolution_bwd_weights = jit_uni_dw_convolution_bwd_weights_t; @@ -229,7 +228,7 @@ struct jit_uni_dw_convolution_bwd_weights_t : public primitive_t { return status::success; } - jit_conv_conf_t jcp_; + jit_conv_conf_t jcp_ = utils::zero(); protected: bool set_default_formats() { @@ -253,11 +252,11 @@ struct jit_uni_dw_convolution_bwd_weights_t : public primitive_t { jit_uni_dw_convolution_bwd_weights_t(const pd_t *apd); - typedef typename prec_traits::type bf16_data_t; - typedef typename prec_traits::type f32_data_t; - typedef typename prec_traits::type src_data_t; - typedef typename prec_traits::type diff_dst_data_t; - typedef typename prec_traits::type diff_weights_data_t; + typedef typename prec_traits_t::type bf16_data_t; + typedef typename prec_traits_t::type f32_data_t; + typedef typename prec_traits_t::type src_data_t; + typedef typename prec_traits_t::type diff_dst_data_t; + typedef typename prec_traits_t::type diff_weights_data_t; status_t 
init(engine_t *engine) override { CHECK(safe_ptr_assign(kernel_, diff --git a/src/cpu/aarch64/jit_uni_eltwise.hpp b/src/cpu/aarch64/jit_uni_eltwise.hpp index 97ee5ed4c11..9665a7e9c12 100644 --- a/src/cpu/aarch64/jit_uni_eltwise.hpp +++ b/src/cpu/aarch64/jit_uni_eltwise.hpp @@ -50,7 +50,7 @@ struct jit_uni_eltwise_fwd_t : public primitive_t { jit_uni_eltwise_fwd_t(const pd_t *apd); virtual ~jit_uni_eltwise_fwd_t(); - typedef typename prec_traits::type data_t; + typedef typename prec_traits_t::type data_t; status_t init(engine_t *engine) override; @@ -75,7 +75,7 @@ struct jit_uni_eltwise_bwd_t : public primitive_t { jit_uni_eltwise_bwd_t(const pd_t *apd); virtual ~jit_uni_eltwise_bwd_t(); - typedef typename prec_traits::type data_t; + typedef typename prec_traits_t::type data_t; status_t init(engine_t *engine) override; diff --git a/src/cpu/aarch64/jit_uni_eltwise_int.hpp b/src/cpu/aarch64/jit_uni_eltwise_int.hpp index 7f646a2275a..bb487ff0393 100644 --- a/src/cpu/aarch64/jit_uni_eltwise_int.hpp +++ b/src/cpu/aarch64/jit_uni_eltwise_int.hpp @@ -50,7 +50,7 @@ struct jit_uni_eltwise_int_fwd_t : public primitive_t { jit_uni_eltwise_int_fwd_t(const pd_t *apd); ~jit_uni_eltwise_int_fwd_t(); - typedef typename prec_traits::type data_t; + typedef typename prec_traits_t::type data_t; status_t init(engine_t *engine) override; diff --git a/src/cpu/aarch64/jit_uni_i8i8_pooling.cpp b/src/cpu/aarch64/jit_uni_i8i8_pooling.cpp index 6e14591b282..dfe28ec0475 100644 --- a/src/cpu/aarch64/jit_uni_i8i8_pooling.cpp +++ b/src/cpu/aarch64/jit_uni_i8i8_pooling.cpp @@ -128,8 +128,8 @@ struct jit_uni_i8i8_pooling_fwd_ker_t : public jit_generator { // thus we need to take into account ratio of sizes s32/i8 = 4 static constexpr data_type_t avg_proc_dt = data_type::s32; enum : int { - s32_to_i8_ratio = sizeof(typename prec_traits::type) - / sizeof(typename prec_traits::type), + s32_to_i8_ratio = sizeof(typename prec_traits_t::type) + / sizeof(typename prec_traits_t::type), max_num_ll = s32_to_i8_ratio, mmx_msk_base_reg = 3 }; diff --git a/src/cpu/aarch64/jit_uni_pooling.cpp b/src/cpu/aarch64/jit_uni_pooling.cpp index 46d26aeb977..e60f8fc4472 100644 --- a/src/cpu/aarch64/jit_uni_pooling.cpp +++ b/src/cpu/aarch64/jit_uni_pooling.cpp @@ -560,7 +560,7 @@ void jit_uni_pooling_fwd_t::execute_forward(const data_t *src, const auto post_ops_binary_rhs_arg_vec = binary_injector::prepare_binary_args(jpp.post_ops, ctx); - using wsp_data_t = typename prec_traits::type; + using wsp_data_t = typename prec_traits_t::type; using namespace jit_uni_pooling_utils; const auto transpose_facade @@ -688,7 +688,7 @@ void jit_uni_pooling_fwd_t::execute_forward_3d(const data_t *src, const auto post_ops_binary_rhs_arg_vec = binary_injector::prepare_binary_args(jpp.post_ops, ctx); - using wsp_data_t = typename prec_traits::type; + using wsp_data_t = typename prec_traits_t::type; using namespace jit_uni_pooling_utils; static constexpr int first_ithr = 0; @@ -893,7 +893,7 @@ void jit_uni_pooling_bwd_t::execute_backward( const exec_ctx_t &ctx) const { using namespace jit_uni_pooling_utils; - using wsp_data_t = typename prec_traits::type; + using wsp_data_t = typename prec_traits_t::type; const memory_desc_wrapper diff_src_d(pd()->diff_src_md()); const memory_desc_wrapper diff_dst_d(pd()->diff_dst_md()); @@ -1018,7 +1018,7 @@ void jit_uni_pooling_bwd_t::execute_backward_3d( const auto &jpp = pd()->jpp_; - using wsp_data_t = typename prec_traits::type; + using wsp_data_t = typename prec_traits_t::type; using namespace jit_uni_pooling_utils; static 
constexpr int first_ithr = 0; diff --git a/src/cpu/aarch64/jit_uni_pooling.hpp b/src/cpu/aarch64/jit_uni_pooling.hpp index 8f6448c4bd7..ac854c75fce 100644 --- a/src/cpu/aarch64/jit_uni_pooling.hpp +++ b/src/cpu/aarch64/jit_uni_pooling.hpp @@ -82,7 +82,7 @@ struct jit_uni_pooling_fwd_t : public primitive_t { jit_uni_pooling_fwd_t &operator=(jit_uni_pooling_fwd_t &&) = default; ~jit_uni_pooling_fwd_t(); - using data_t = typename prec_traits::type; + using data_t = typename prec_traits_t::type; status_t init(engine_t *engine) override; @@ -151,7 +151,7 @@ struct jit_uni_pooling_bwd_t : public primitive_t { jit_uni_pooling_bwd_t &operator=(jit_uni_pooling_bwd_t &&) = default; ~jit_uni_pooling_bwd_t(); - using data_t = typename prec_traits::type; + using data_t = typename prec_traits_t::type; status_t init(engine_t *engine) override; diff --git a/src/cpu/aarch64/jit_uni_reorder.cpp b/src/cpu/aarch64/jit_uni_reorder.cpp index 6d08a7f55a6..f2aa1f42d2c 100644 --- a/src/cpu/aarch64/jit_uni_reorder.cpp +++ b/src/cpu/aarch64/jit_uni_reorder.cpp @@ -1,7 +1,7 @@ /******************************************************************************* * Copyright 2018-2023 Intel Corporation * Copyright 2020-2024 FUJITSU LIMITED -* Copyright 2022-2024 Arm Ltd. and affiliates +* Copyright 2022-2025 Arm Ltd. and affiliates * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -161,14 +161,28 @@ struct jit_uni_reorder_kernel_f32_t : public kernel_t, public jit_generator { static bool applicable(const prb_t &p) { using namespace data_type; + bool bf16_ok + = (mayiuse_bf16() && (p.itype == bf16) && (p.otype == bf16) + && !interim_f32_needed(p, false) && p.beta == 0.f) + || (p.itype != bf16 && p.otype != bf16) + || (p.itype == f32 && p.otype == bf16 && mayiuse_bf16() + && p.beta == 0.f) + || (p.itype == bf16 && p.otype == f32 && mayiuse_bf16() + && p.beta == 0.f); + + bool is_f16 = (p.itype == f16 || p.otype == f16); + bool f16_ok = (p.itype == f32 && p.otype == f16 && p.beta == 0.f) + || (p.itype == f16 && p.otype == f32 && p.beta == 0.f); + bool ok = true && p.ndims > 0 - && utils::one_of(p.itype, f32, s32, data_type::s8, u8) - && utils::one_of(p.otype, f32, bf16, s32, data_type::s8, u8) + && utils::one_of( + p.itype, f32, f16, bf16, s32, data_type::s8, u8) + && utils::one_of( + p.otype, f32, f16, bf16, s32, data_type::s8, u8) && utils::everyone_is(0, p.ioff, p.ooff) /* do we need this? */ && utils::one_of(p.beta, 0.f, 1.f) /* anything else? 
*/ && simple_impl_desc_init(p, nullptr) && prb_has_small_strides(p) - && IMPLICATION( - p.otype == bf16, p.itype == f32 && mayiuse_bf16()); + && bf16_ok && IMPLICATION(is_f16, f16_ok); return ok; } @@ -272,7 +286,9 @@ struct jit_uni_reorder_kernel_f32_t : public kernel_t, public jit_generator { case f32: /* do nothing */ break; + case f16: cvt_v_f16_f32(startIdx, regNum); break; case s32: cvt_z_s32_f32(startIdx, regNum); break; + case bf16: cvt_v_bf16_fp32(startIdx, regNum); break; case data_type::s8: cvt_z_s8_s32(startIdx, regNum); cvt_z_s32_f32(startIdx, regNum); @@ -302,6 +318,12 @@ struct jit_uni_reorder_kernel_f32_t : public kernel_t, public jit_generator { cvt_z_s32_s8(startIdx, regNum); if (idt == u8) cvt_z_u8_s8(startIdx, regNum); break; + case data_type::bf16: + if (idt == f32) cvt_v_f32_bf16(startIdx, regNum); + break; + case data_type::f16: + if (idt == f32) cvt_v_f32_f16(startIdx, regNum); + break; case u8: if (idt == f32) cvt_z_f32_s32(startIdx, regNum); if (utils::one_of(idt, f32, s32)) @@ -614,6 +636,8 @@ struct jit_uni_reorder_kernel_f32_t : public kernel_t, public jit_generator { /* do nothing */ break; case s32: cvt_v_s32_f32(startIdx, regNum); break; + case bf16: cvt_v_bf16_fp32(startIdx, regNum); break; + case f16: cvt_v_f16_f32(startIdx, regNum); break; case data_type::s8: cvt_v_s8_s32(startIdx, regNum); cvt_v_s32_f32(startIdx, regNum); @@ -629,6 +653,10 @@ struct jit_uni_reorder_kernel_f32_t : public kernel_t, public jit_generator { auto cvt2odt = [=](const int startIdx, const int regNum, data_type_t odt, data_type_t idt) { switch (odt) { + case f32: + if (idt == bf16) cvt_v_bf16_fp32(startIdx, regNum); + if (idt == f16) cvt_v_f16_f32(startIdx, regNum); + break; case s32: if (idt == f32) cvt_v_f32_s32(startIdx, regNum); @@ -652,6 +680,9 @@ struct jit_uni_reorder_kernel_f32_t : public kernel_t, public jit_generator { case bf16: if (idt == f32) cvt_v_f32_bf16(startIdx, regNum); break; + case f16: + if (idt == f32) cvt_v_f32_f16(startIdx, regNum); + break; default: assert(!"unreachable"); } }; @@ -702,7 +733,7 @@ struct jit_uni_reorder_kernel_f32_t : public kernel_t, public jit_generator { const int load_tail_step = !can_load_xmm && can_store_xmm ? ur_step : load_step; - const bool interim_f32 = interim_f32_needed(); + const bool interim_f32 = interim_f32_needed(prb_, compensation_needed_); const bool need_saturation = (utils::one_of(prb_.otype, u8, data_type::s8, s32) @@ -775,7 +806,7 @@ struct jit_uni_reorder_kernel_f32_t : public kernel_t, public jit_generator { // transposition on the fly const bool fast_return = prb_.src_scale_type != scale_type_t::MANY && prb_.dst_scale_type != scale_type_t::MANY - && prb_.beta == 0.f; + && prb_.beta == 0.f && !prb_.req_src_zp && !prb_.req_dst_zp; if (fast_return) { if (prb_.src_scale_type == scale_type_t::COMMON) for (int ur = 0; ur < reg_unroll; ur += load_step) @@ -1285,17 +1316,17 @@ struct jit_uni_reorder_kernel_f32_t : public kernel_t, public jit_generator { } } - bool interim_f32_needed() { + static bool interim_f32_needed(const prb_t &prb, bool compensation_needed) { using namespace data_type; - - return utils::one_of(f32, prb_.itype, prb_.otype) - || prb_.src_scale_type != scale_type_t::NONE - || prb_.dst_scale_type != scale_type_t::NONE || prb_.beta != 0.f - || ((prb_.req_src_zp || prb_.req_dst_zp) - ? 
!(prb_.itype == s32 && prb_.otype == s32) + bool ret = utils::one_of(f32, prb.itype, prb.otype) + || prb.src_scale_type != scale_type_t::NONE + || prb.dst_scale_type != scale_type_t::NONE || prb.beta != 0.f + || ((prb.req_src_zp || prb.req_dst_zp) + ? !(prb.itype == s32 && prb.otype == s32) : false) - || (prb_.itype != f32 && compensation_needed_) - || prb_.scale_adjust != 1.f; + || (prb.itype != f32 && compensation_needed) + || prb.scale_adjust != 1.f; + return ret; } void process_unroll_generic( @@ -1313,7 +1344,7 @@ struct jit_uni_reorder_kernel_f32_t : public kernel_t, public jit_generator { int curr = 0; // will switch between 0 and 1 - const bool interim_f32 = interim_f32_needed(); + const bool interim_f32 = interim_f32_needed(prb_, compensation_needed_); if (prb_.req_src_zp) { add_imm(X_DEFAULT_ADDR, PARAM(src_zp), X_TMP_0); @@ -1685,6 +1716,18 @@ struct jit_uni_reorder_kernel_f32_t : public kernel_t, public jit_generator { UNROLL_INST2(bfcvtn, VReg4H(i), VReg4S(i)); } + void cvt_v_bf16_fp32(const size_t startIdx, const size_t regNum) { + UNROLL_INST2(shll, VReg4S(i), VReg4H(i), 16); + } + + void cvt_v_f16_f32(const size_t startIdx, const size_t regNum) { + UNROLL_INST2(fcvtl, VReg4S(i), VReg4H(i)); + } + + void cvt_v_f32_f16(const size_t startIdx, const size_t regNum) { + UNROLL_INST2(fcvtn, VReg4H(i), VReg4S(i)); + } + void cvt_z_s8_s32(const size_t startIdx, const size_t regNum) { cvt_z_b_s(startIdx, regNum); UNROLL_INST(sxtb, ZRegS, tmp, P_ALL_ONE / T_m, tmp); @@ -2730,9 +2773,10 @@ static void prb_thread_kernel_balance( if (want_borrow_ker_from_drv || want_borrow_drv_from_ker) { DEBUG({ - printf("split: "); - prb_dump(prb); - printf("ndims_ker_max = %d\n", ndims_ker_max); + verbose_printf( + verbose_t::debuginfo, "split: %s\n", prb_dump(prb).c_str()); + verbose_printf(verbose_t::debuginfo, "ndims_ker_max = %d\n", + ndims_ker_max); }); } } @@ -2767,13 +2811,10 @@ status_t jit_uni_reorder_t::pd_t::init_scratchpad() { compensation_reduce_size); } - const memory_desc_wrapper input_d(src_md()); - int scales_mask = -1; - bool is_set = false; - CHECK(attr()->scales_.get(DNNL_ARG_DST, &scales_mask, &is_set)); - - if (is_set && scales_mask > 0) { - get_D_values(input_d, scales_mask, nullptr, &D_mask_, nullptr); + if (!attr()->scales_.has_default_values(DNNL_ARG_DST)) { + const memory_desc_wrapper input_d(src_md()); + int mask = attr()->scales_.get_mask(DNNL_ARG_DST); + get_D_values(input_d, mask, nullptr, &D_mask_, nullptr); if (D_mask_ > 1) { scratchpad.template book( memory_tracking::names::key_reorder_precomputed_dst_scales, @@ -2797,8 +2838,8 @@ status_t jit_uni_reorder_t::pd_t::create(reorder_pd_t **reorder_pd, prb_block_for_cache(prb); DEBUG({ - printf("cache: "); - prb_dump(prb); + verbose_printf( + verbose_t::debuginfo, "cache: %s\n", prb_dump(prb).c_str()); }); int ndims_ker_max {}; @@ -2817,8 +2858,8 @@ status_t jit_uni_reorder_t::pd_t::create(reorder_pd_t **reorder_pd, return status::unimplemented; DEBUG({ - printf("ker : "); - prb_dump(ker_desc.prb); + verbose_printf(verbose_t::debuginfo, "ker : %s\n", + prb_dump(ker_desc.prb).c_str()); }); auto _pd = make_unique_pd( @@ -3027,12 +3068,12 @@ void jit_uni_reorder_t::omp_driver(const char *in, char *out, out += pd()->prb_.ooff * data_type_size(pd()->prb_.otype); DEBUG({ - printf("prb : "); - tr::prb_dump(pd()->prb_); + verbose_printf(verbose_t::debuginfo, "prb : %s\n", + tr::prb_dump(pd()->prb_).c_str()); }); DEBUG({ - printf("ker : "); - tr::prb_dump(pd()->ker_desc_.prb); + verbose_printf(verbose_t::debuginfo, "ker : %s\n", + 
tr::prb_dump(pd()->ker_desc_.prb).c_str()); }); int ndims = pd()->prb_.ndims; @@ -3236,8 +3277,8 @@ status_t jit_blk_reorder_t::pd_t::create(reorder_pd_t **reorder_pd, prb_tile_normalize(prb); DEBUG({ - printf("tile : "); - prb_dump(prb); + verbose_printf( + verbose_t::debuginfo, "tile : %s\n", prb_dump(prb).c_str()); }); if (!tr::jit_single_blk_kernel_t::applicable(prb)) { diff --git a/src/cpu/aarch64/jit_uni_reorder.hpp b/src/cpu/aarch64/jit_uni_reorder.hpp index 4587fd82e21..83ac55ed855 100644 --- a/src/cpu/aarch64/jit_uni_reorder.hpp +++ b/src/cpu/aarch64/jit_uni_reorder.hpp @@ -149,8 +149,8 @@ void prb_node_swap(prb_t &p, int d0, int d1); * to the right if d0 > d1 */ void prb_node_move(prb_t &p, int d0, int d1); -/** dumps the problem to stdout */ -void prb_dump(const prb_t &p); +/** dumps the problem to a string */ +std::string prb_dump(const prb_t &p); struct call_param_t { const void *in = nullptr; diff --git a/src/cpu/aarch64/jit_uni_reorder_utils.cpp b/src/cpu/aarch64/jit_uni_reorder_utils.cpp index 5000f904f0d..90e78f3877b 100644 --- a/src/cpu/aarch64/jit_uni_reorder_utils.cpp +++ b/src/cpu/aarch64/jit_uni_reorder_utils.cpp @@ -205,9 +205,8 @@ status_t prb_init(prb_t &p, const memory_desc_t &imd, const memory_desc_t &omd, bool ok = im_d.is_blocking_desc() && om_d.is_blocking_desc() && !im_d.has_runtime_dims_or_strides() && !im_d.has_zero_dim() && !om_d.has_runtime_dims_or_strides() && !om_d.has_zero_dim() - && attr->has_default_values( - primitive_attr_t::skip_mask_t::scales_runtime - | primitive_attr_t::skip_mask_t::zero_points_runtime + && attr->has_default_values(primitive_attr_t::skip_mask_t::scales + | primitive_attr_t::skip_mask_t::zero_points | primitive_attr_t::skip_mask_t::post_ops) && check_post_ops(attr); if (!ok) return unimplemented; @@ -276,24 +275,21 @@ status_t prb_init(prb_t &p, const memory_desc_t &imd, const memory_desc_t &omd, p.src_scale_type = scale_type_t::NONE; int src_mask = 0; - bool is_src_set = false; - CHECK(attr->scales_.get(DNNL_ARG_SRC, &src_mask, &is_src_set)); - if (is_src_set) { + if (!attr->scales_.has_default_values(DNNL_ARG_SRC)) { + src_mask = attr->scales_.get_mask(DNNL_ARG_SRC); p.src_scale_type = src_mask == 0 ? scale_type_t::COMMON : scale_type_t::MANY; } p.dst_scale_type = scale_type_t::NONE; int dst_mask = 0; - bool is_dst_set = false; - CHECK(attr->scales_.get(DNNL_ARG_DST, &dst_mask, &is_dst_set)); - if (is_dst_set) { + if (!attr->scales_.has_default_values(DNNL_ARG_DST)) { + dst_mask = attr->scales_.get_mask(DNNL_ARG_DST); p.dst_scale_type = dst_mask == 0 ? scale_type_t::COMMON : scale_type_t::MANY; } - if (is_src_set && is_dst_set && src_mask != dst_mask) - return status::unimplemented; + if (src_mask != dst_mask) return status::unimplemented; p.scale_adjust = (om_d.extra().flags & memory_extra_flags::scale_adjust) ? om_d.extra().scale_adjust @@ -431,14 +427,14 @@ status_t prb_init(prb_t &p, const memory_desc_t &imd, const memory_desc_t &omd, p.beta = sum_idx == -1 ? 
0.f : attr->post_ops_.entry_[sum_idx].sum.scale; DEBUG({ - printf("init : "); - prb_dump(p); + verbose_printf( + verbose_t::debuginfo, "init : %s\n", prb_dump(p).c_str()); }); // Sort the prb array in increasing sizes of the output stride prb_normalize(p); DEBUG({ - printf("norm : "); - prb_dump(p); + verbose_printf( + verbose_t::debuginfo, "norm : %s\n", prb_dump(p).c_str()); }); // compensation strides require prb_normalized @@ -448,8 +444,8 @@ status_t prb_init(prb_t &p, const memory_desc_t &imd, const memory_desc_t &omd, * sides of the reorder */ prb_simplify(p); DEBUG({ - printf("smpl : "); - prb_dump(p); + verbose_printf( + verbose_t::debuginfo, "smpl : %s\n", prb_dump(p).c_str()); }); return success; @@ -605,16 +601,20 @@ void prb_node_move(prb_t &p, int d0, int d1) { p.nodes[d1] = node; } -void prb_dump(const prb_t &p) { - printf("@@@ type:%s:%s ndims:%d ", dnnl_dt2str(p.itype), - dnnl_dt2str(p.otype), p.ndims); - for (int d = 0; d < p.ndims; ++d) - printf("[%zu:%zu:%d:%d:%s:%td:%td:%td:%td]", p.nodes[d].n, - p.nodes[d].tail_size, p.nodes[d].dim_id, - p.nodes[d].parent_node_id, - p.nodes[d].is_zero_pad_needed ? "true" : "false", p.nodes[d].is, - p.nodes[d].os, p.nodes[d].ss, p.nodes[d].cs); - printf(" off:%zu:%zu\n", p.ioff, p.ooff); +std::string prb_dump(const prb_t &p) { + std::stringstream ss; + ss << "@@@ type:" << dnnl_dt2str(p.itype) << ':' << dnnl_dt2str(p.otype) + << " ndims:" << p.ndims; + for (int d = 0; d < p.ndims; ++d) { + if (d != 0) ss << 'x'; + const auto &node = p.nodes[d]; + ss << '[' << node.n << ':' << node.tail_size << ':' << node.dim_id + << ':' << node.parent_node_id << ':' + << (node.is_zero_pad_needed ? "true" : "false") << ':' << node.is + << ':' << node.os << ':' << node.ss << ':' << node.cs << ']'; + } + ss << " off:" << p.ioff << ':' << p.ooff; + return ss.str(); } } // namespace tr diff --git a/src/cpu/aarch64/jit_uni_softmax.cpp b/src/cpu/aarch64/jit_uni_softmax.cpp index ecd91200f3f..1450f7788f9 100644 --- a/src/cpu/aarch64/jit_uni_softmax.cpp +++ b/src/cpu/aarch64/jit_uni_softmax.cpp @@ -16,6 +16,7 @@ *******************************************************************************/ #include +#include #include "common/c_types_map.hpp" #include "common/dnnl_thread.hpp" @@ -668,12 +669,10 @@ struct jit_softmax_t : public jit_softmax_base_t { template jit_uni_softmax_fwd_t::jit_uni_softmax_fwd_t(const pd_t *apd) : primitive_t(apd) - , softmax_driver_(new softmax_impl::driver_t(pd())) {} + , softmax_driver_(utils::make_unique>(pd())) {} template -jit_uni_softmax_fwd_t::~jit_uni_softmax_fwd_t() { - delete softmax_driver_; -} +jit_uni_softmax_fwd_t::~jit_uni_softmax_fwd_t() = default; template status_t jit_uni_softmax_fwd_t::init(engine_t *engine) { @@ -725,12 +724,10 @@ status_t jit_uni_softmax_fwd_t::execute(const exec_ctx_t &ctx) const { template jit_uni_softmax_bwd_t::jit_uni_softmax_bwd_t(const pd_t *apd) : primitive_t(apd) - , softmax_driver_(new softmax_impl::driver_t(pd())) {} + , softmax_driver_(utils::make_unique>(pd())) {} template -jit_uni_softmax_bwd_t::~jit_uni_softmax_bwd_t() { - delete softmax_driver_; -} +jit_uni_softmax_bwd_t::~jit_uni_softmax_bwd_t() = default; template status_t jit_uni_softmax_bwd_t::init(engine_t *engine) { diff --git a/src/cpu/aarch64/jit_uni_softmax.hpp b/src/cpu/aarch64/jit_uni_softmax.hpp index b8933442b1e..090d4300b56 100644 --- a/src/cpu/aarch64/jit_uni_softmax.hpp +++ b/src/cpu/aarch64/jit_uni_softmax.hpp @@ -19,6 +19,7 @@ #define CPU_AARCH64_JIT_UNI_SOFTMAX_HPP #include +#include #include 
"common/c_types_map.hpp" #include "common/memory_tracking.hpp" @@ -80,7 +81,7 @@ struct jit_uni_softmax_fwd_t : public primitive_t { utils::one_of(bf16, src_dt, dst_dt), mayiuse_bf16()) && (mayiuse(sve_512) || mayiuse(sve_256) || mayiuse(sve_128)) - && attr()->has_default_values(skip_mask_t::scales_runtime) + && attr()->has_default_values(skip_mask_t::scales) && attr_scales_ok() && set_default_formats() == status::success; if (!ok) return status::unimplemented; @@ -119,7 +120,9 @@ struct jit_uni_softmax_fwd_t : public primitive_t { private: const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); } - softmax_impl::driver_t *softmax_driver_; + std::unique_ptr> softmax_driver_; + + DNNL_DISALLOW_COPY_AND_ASSIGN(jit_uni_softmax_fwd_t); }; template @@ -191,7 +194,9 @@ struct jit_uni_softmax_bwd_t : public primitive_t { private: const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); } - softmax_impl::driver_t *softmax_driver_; + std::unique_ptr> softmax_driver_; + + DNNL_DISALLOW_COPY_AND_ASSIGN(jit_uni_softmax_bwd_t); }; } // namespace aarch64 diff --git a/src/cpu/aarch64/matmul/brgemm_matmul.cpp b/src/cpu/aarch64/matmul/brgemm_matmul.cpp index bebdae12041..1f7bd0088d6 100644 --- a/src/cpu/aarch64/matmul/brgemm_matmul.cpp +++ b/src/cpu/aarch64/matmul/brgemm_matmul.cpp @@ -1,6 +1,7 @@ /******************************************************************************* * Copyright 2021-2023 Intel Corporation * Copyright 2024 FUJITSU LIMITED +* Copyright 2024 Arm Ltd. and affiliates * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at @@ -71,9 +72,9 @@ status_t brgemm_matmul_t::pd_t::init(engine_t *engine) { const std::vector supported_args = {DNNL_ARG_SRC, DNNL_ARG_WEIGHTS, DNNL_ARG_DST}; bool ok = attr_scales_ok(supported_args); - if (!attr()->scales_.get(DNNL_ARG_SRC).has_default_values() - && !attr()->scales_.get(DNNL_ARG_WEIGHTS).has_default_values() - && attr()->scales_.get(DNNL_ARG_WEIGHTS).mask_ != 0) { + if (!attr()->scales_.has_default_values(DNNL_ARG_SRC) + && !attr()->scales_.has_default_values(DNNL_ARG_WEIGHTS) + && attr()->scales_.get_mask(DNNL_ARG_WEIGHTS) > 0) { // This case requires scratchpad if (N() == DNNL_RUNTIME_DIM_VAL) ok = false; } @@ -83,8 +84,18 @@ status_t brgemm_matmul_t::pd_t::init(engine_t *engine) { return ok; }; - auto check_attr_zero_points - = [&]() -> bool { return attr()->zero_points_.common(); }; + auto check_attr_zero_points = [&]() -> bool { + const auto &zp = attr()->zero_points_; + static const std::vector supported_args { + DNNL_ARG_SRC, DNNL_ARG_WEIGHTS, DNNL_ARG_DST}; + for (int arg : supported_args) { + if (!zp.has_default_values(arg)) { + const int mask = zp.get_mask(arg); + if (mask > 0) return false; + } + } + return true; + }; // The current version supports runtime value for M dimension in the case // of 2d problems only and do not support any runtime strides for B and C @@ -101,9 +112,8 @@ status_t brgemm_matmul_t::pd_t::init(engine_t *engine) { VDISPATCH_MATMUL( no_dynamic_strides_for_B_and_C, VERBOSE_RUNTIMEDIM_UNSUPPORTED); VDISPATCH_MATMUL( - attr()->has_default_values( - primitive_attr_t::skip_mask_t::scales_runtime - | primitive_attr_t::skip_mask_t::zero_points_runtime + attr()->has_default_values(primitive_attr_t::skip_mask_t::scales + | primitive_attr_t::skip_mask_t::zero_points | primitive_attr_t::skip_mask_t::post_ops | primitive_attr_t::skip_mask_t::sum_dt, dst_dt), @@ -158,6 +168,9 @@ 
status_t brgemm_matmul_t::pd_t::init(engine_t *engine) { = bgmmc_.post_ops_applicable && bgmmc_.nthr_k > 1; CHECK(brgemm_desc_set_attr(&brg, brgattr)); + + CHECK(brgemm_desc_finalize(&brg)); + bgmmc_.wsp_tile_per_thr_bytes = nstl::max( brg.get_wsp_buffer_size(), bgmmc_.wsp_tile_per_thr_bytes); } @@ -642,7 +655,6 @@ void brgemm_matmul_t::copy_b_chunk_in_buffer( = (void *)brgmm_ctx.get_s8s8_comp_ptr(ithr, b_idx, n_blk_idx); ctx.current_K_start = k; ctx.current_K_iters = nstl::min(bgmmc.K_blk, bgmmc.K); - assert(isa == sve_512); (*copy_B_kernel_)(&ctx); } @@ -654,7 +666,6 @@ void brgemm_matmul_t::copy_b_chunk_in_buffer( = (void *)brgmm_ctx.get_s8s8_comp_ptr(ithr, b_idx, n_blk_idx); ctx.current_K_start = k; ctx.current_K_iters = bgmmc.K % bgmmc.K_blk; - assert(isa == sve_512); (*copy_B_kernel_)(&ctx); } } diff --git a/src/cpu/aarch64/matmul/brgemm_matmul_reorders.cpp b/src/cpu/aarch64/matmul/brgemm_matmul_reorders.cpp index 0aa98fe9c2b..bfb917afbbd 100644 --- a/src/cpu/aarch64/matmul/brgemm_matmul_reorders.cpp +++ b/src/cpu/aarch64/matmul/brgemm_matmul_reorders.cpp @@ -85,7 +85,8 @@ status_t brgemm_matmul_matrix_B_reorder_t::pd_t::init( matmul_conf_for_reorder_.K = dims[ndims - 2]; matmul_conf_for_reorder_.N = dims[ndims - 1]; matmul_conf_for_reorder_.wei_n_blk = matmul_conf_for_reorder_.N_blk - = matmul_conf_for_reorder_.LDB = matmul::get_default_n_block(otag); + = matmul_conf_for_reorder_.LDB + = matmul::get_default_n_block(otag, matmul_conf_for_reorder_); matmul_conf_for_reorder_.N_tail = matmul_conf_for_reorder_.N % matmul_conf_for_reorder_.N_blk; matmul_conf_for_reorder_.K_blk = 16 * vnni_granularity; diff --git a/src/cpu/aarch64/matmul/brgemm_matmul_utils.cpp b/src/cpu/aarch64/matmul/brgemm_matmul_utils.cpp index bd9bc023eaf..0610147c752 100644 --- a/src/cpu/aarch64/matmul/brgemm_matmul_utils.cpp +++ b/src/cpu/aarch64/matmul/brgemm_matmul_utils.cpp @@ -1,5 +1,7 @@ /******************************************************************************* * Copyright 2021-2023 Intel Corporation +* Copyright 2023-2024 FUJITSU LIMITED +* Copyright 2024 Arm Ltd. and affiliates * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -47,7 +49,8 @@ using namespace dnnl::impl::utils; using namespace data_type; using namespace format_tag; -int get_default_n_block(format_tag_t matrix_b_tag) { +int get_default_n_block( + format_tag_t matrix_b_tag, brgemm_matmul_conf_t &bgmmc) { // Note: consider using weights mem_descriptor 'inner_blks' to // return B's inner block for non-default cases. switch (matrix_b_tag) { @@ -75,7 +78,23 @@ int get_default_n_block(format_tag_t matrix_b_tag) { case BA16a16b: case BA16a16b2a: case BA16a16b4a: return 16; - default: return 64; + default: { + if (bgmmc.N == 16 || bgmmc.N == 32 || bgmmc.N == 64) return bgmmc.N; + if (!mayiuse(sve_512)) { + if (bgmmc.N <= 16) + return 16; + else { + // It is observed that for M,K>512, N block of 64 works better provided that thread distribution is not hindered. 
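+                    // E.g. N = 2048, M = K = 1024, nthr = 16: N / 64 = 32 >= nthr
+                    // and M, K > 512, so the 64-wide N block is picked below; with
+                    // nthr = 64 the same shape takes the 32-wide block instead, so
+                    // that thread distribution is not hindered.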
+                    if (bgmmc.N / 64 >= bgmmc.nthr && bgmmc.K > 512
+                            && bgmmc.M > 512)
+                        return 64;
+                    else
+                        return 32;
+                }
+
+            } else
+                return 64;
+        }
     }
 }
@@ -128,9 +147,8 @@ bool post_ops_ok(brgemm_matmul_conf_t &bgmmc, const primitive_attr_t &attr,
             true /*sum_requires_same_params*/, bcast_set));
 }
-status_t check_isa_with_datatype(
-        const cpu_isa_t isa, const brgemm_matmul_conf_utils_t &bm_conf_utils) {
-    if (bm_conf_utils.is_f32() && !bm_conf_utils.is_int8()
+status_t check_datatype(const brgemm_matmul_conf_utils_t &bm_conf_utils) {
+    if (bm_conf_utils.is_f32() && !bm_conf_utils.is_bf32()
             && !bm_conf_utils.is_bf16() && !bm_conf_utils.is_f16()
             && !bm_conf_utils.is_int8())
         return status::success;
@@ -178,7 +196,7 @@ status_t brgemm_matmul_conf_utils_t::set_or_check_B_tag(
     if (B_any_layout) {
         const int default_n_block = init_n_tag
-                ? get_default_n_block(format_tag::undef)
+                ? get_default_n_block(format_tag::undef, bgmmc)
                 : bgmmc.N_blk;
         bgmmc.wei_tag = blocked_B_layouts_allowed
                 ? this->pick_blocked_B_layout(default_n_block)
@@ -320,10 +338,6 @@ format_tag_t brgemm_matmul_conf_utils_t::pick_blocked_B_layout(
         default: return format_tag::undef;
     }
-    assert(!this->is_bf16());
-    assert(!this->is_f16());
-    assert(!this->is_bf32());
-
     // Note: bf32 assumes f32 blocking
     if (this->is_f32() || this->is_bf32() || this->is_f16()) switch (n_blk) {
             case 64: return bgmmc.ndims == 3 ? aCB16b64c : BA16a64b;
@@ -580,14 +594,17 @@ float compute_blocking_heuristic_sve_256(brgemm_matmul_conf_t &bgmmc,
     const int nthr = bgmmc.nthr;
     const int max_m_blk = nstl::min(/*64*/ 256, matmul.M);
-    int min_m_blk = nstl::min(32, matmul.M); // max_m_blk
+    // It is found that for 2D shapes min_m_blk = 128 works better than 32 for most shapes.
+    int min_m = (matmul.batch > 1) ? 32 : 128;
+    int min_m_blk = nstl::min(min_m, matmul.M); // max_m_blk
     int n_blk = bgmmc.N_blk;
     const int n_chunks = div_up(matmul.N, n_blk);
     const int max_n_chunks = bgmmc.use_buffer_a ? 16 : 1;
     const int n_chunks_start = nstl::min(max_n_chunks, n_chunks);
-    int default_k_blk = 1024;
+    // It is found that for M < 512, a k_blk of 128 works better than 1024 for most shapes.
+    int default_k_blk = (matmul.M >= 512) ? 1024 : 128;
     int k_blk = nstl::min(matmul.K, default_k_blk);
     int start_nthr_k = 1;
@@ -597,7 +614,22 @@ float compute_blocking_heuristic_sve_256(brgemm_matmul_conf_t &bgmmc,
     const bool low_parallel_work = static_cast(nthr) > max_parallel;
     if (low_parallel_work) {
-        min_m_blk = nstl::min(matmul.M, 16);
+        int best_m_blk = 0;
+        float scr = 0, best_scr = 16 * nthr;
+        for (int i = 16; i >= 4; i--) {
+            scr = 0.7 * (matmul.M % i)
+                    + 0.3 * std::abs(nthr - ((float)matmul.M / (float)i));
+            if (scr < best_scr) {
+                best_scr = scr;
+                best_m_blk = i;
+            }
+        }
+        min_m_blk = nstl::min(matmul.M, best_m_blk);
+        // Here min_m_blk is set based on the M value and the number of threads.
+        // Decreasing m_blk increases the number of M blocks, which may use the
+        // threads better, but m_blk being a factor of M was found to matter more
+        // than maximum thread utilisation, so it carries the larger weight (0.7)
+        // in the scoring; this was experimentally verified across multiple shapes.
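+        // E.g. M = 100, nthr = 8: i = 10 scores 0.7 * (100 % 10)
+        // + 0.3 * |8 - 100 / 10| = 0.6, beating i = 16 with a score of
+        // 0.7 * 4 + 0.3 * |8 - 6.25| = 3.325, so min_m_blk becomes 10,
+        // an exact factor of M that yields 10 blocks for 8 threads.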
bool low_spatial_work = matmul.M <= 40; if (low_spatial_work) { @@ -732,8 +764,7 @@ status_t init_brgemm_matmul_conf(cpu_isa_t isa, brgemm_matmul_conf_t &bgmmc, dst_d.format_kind() == format_kind::any, bias_md.format_kind == format_kind::any); - VCHECK_BG(check_isa_with_datatype(isa, bm_conf_utils), - VERBOSE_ISA_DT_MISMATCH); + VCHECK_BG(check_datatype(bm_conf_utils), VERBOSE_UNSUPPORTED_DT); bgmmc.a_dt_sz = bgmmc.tr_a_dt_sz = types::data_type_size(bgmmc.src_dt); bgmmc.b_dt_sz = bgmmc.tr_b_dt_sz = types::data_type_size(bgmmc.wei_dt); @@ -752,21 +783,22 @@ status_t init_brgemm_matmul_conf(cpu_isa_t isa, brgemm_matmul_conf_t &bgmmc, const auto &src_scales = attr.scales_.get(DNNL_ARG_SRC); const auto &wei_scales = attr.scales_.get(DNNL_ARG_WEIGHTS); - bgmmc.with_scales = !src_scales.has_default_values() - || !wei_scales.has_default_values(); - if (bgmmc.with_scales) { - bgmmc.is_oscale_per_n = wei_scales.mask_ == 1 << (bgmmc.ndims - 1); + const bool has_wei_scales = !wei_scales.has_default_values(); + bgmmc.with_scales = !src_scales.has_default_values() || has_wei_scales; + if (has_wei_scales) { + bgmmc.is_oscale_per_n + = wei_scales.get_mask() == (1 << (bgmmc.ndims - 1)); // only common and per-oc-channel scales are supported - VCONDCHECK_BG(wei_scales.mask_ == 0 || bgmmc.is_oscale_per_n, + VCONDCHECK_BG(wei_scales.get_mask() == 0 || bgmmc.is_oscale_per_n, VERBOSE_UNSUPPORTED_SCALES_CFG); } const auto &dst_scales = attr.scales_.get(DNNL_ARG_DST); bgmmc.with_dst_scales = !dst_scales.has_default_values(); // only common scales are supported - if (bgmmc.with_dst_scales && dst_scales.mask_ != 0) - return status::unimplemented; + VCONDCHECK_BG(!(bgmmc.with_dst_scales && dst_scales.get_mask() > 0), + VERBOSE_UNSUPPORTED_SCALES_CFG); const auto &p = attr.post_ops_; bgmmc.with_sum = p.find(primitive_kind::sum) != -1; @@ -834,7 +866,7 @@ status_t init_brgemm_matmul_conf(cpu_isa_t isa, brgemm_matmul_conf_t &bgmmc, VCHECK_BG(attr.set_default_formats(&dst_md), VERBOSE_UNSUPPORTED_TAG); - bgmmc.wei_n_blk = get_default_n_block(bgmmc.wei_tag); + bgmmc.wei_n_blk = get_default_n_block(bgmmc.wei_tag, bgmmc); bgmmc.blocked_B = bm_conf_utils.get_blocked_B(); bgmmc.use_buffer_b = bm_conf_utils.use_buffer_b(); @@ -1107,4 +1139,4 @@ void init_scratchpad(memory_tracking::registrar_t &scratchpad, } // namespace aarch64 } // namespace cpu } // namespace impl -} // namespace dnnl \ No newline at end of file +} // namespace dnnl diff --git a/src/cpu/aarch64/matmul/brgemm_matmul_utils.hpp b/src/cpu/aarch64/matmul/brgemm_matmul_utils.hpp index fb5d88b14f0..ec4e1b75a27 100644 --- a/src/cpu/aarch64/matmul/brgemm_matmul_utils.hpp +++ b/src/cpu/aarch64/matmul/brgemm_matmul_utils.hpp @@ -1,5 +1,6 @@ /******************************************************************************* * Copyright 2021-2023 Intel Corporation +* Copyright 2023-2024 FUJITSU LIMITED * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -312,7 +313,7 @@ status_t init_brgemm_matmul_conf(cpu_isa_t isa, brgemm_matmul_conf_t &bgmmc, void init_scratchpad(memory_tracking::registrar_t &scratchpad, const brgemm_matmul_conf_t &bgmmc); -int get_default_n_block(format_tag_t matrix_b_tag); +int get_default_n_block(format_tag_t, brgemm_matmul_conf_t &bgmmc); } // namespace matmul } // namespace aarch64 diff --git a/src/cpu/aarch64/matmul/jit_int8_kernel_types.hpp b/src/cpu/aarch64/matmul/jit_int8_kernel_types.hpp new file mode 100644 index 00000000000..27d55f19381 --- /dev/null +++ b/src/cpu/aarch64/matmul/jit_int8_kernel_types.hpp @@ -0,0 +1,94 @@ +/******************************************************************************* +* Copyright 2025 FUJITSU LIMITED +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#ifndef CPU_AARCH64_JIT_INT8_KERNEL_TYPES_HPP +#define CPU_AARCH64_JIT_INT8_KERNEL_TYPES_HPP + +namespace dnnl { +namespace impl { +namespace cpu { +namespace aarch64 { +namespace matmul { + +typedef enum { + none = 0, + per_tensor = 1, + per_m = 2, + per_n = 3, + per_k = 4, +} jit_int8_broadcast_t; + +struct dyn_vals_t { + int f = 0; + dim_t M = 0; + dim_t K = 0; + dim_t N = 0; + dim_t B = 0; + int is_s8 = 0, is_u8 = 0; + int mtail, ktail, ntail, m_blk, k_blk, n_blk; + int get_min_max = 0, reorder_a = 0, reorder_b = 0, cal_src = 0; + int is_mtail = 0, is_ktail = 0; +}; + +struct dyn_params_t { + const float *dyn_src; + const int8_t *src; + int8_t *dst; + float *max, *min; + int *nk, *nm, *nn; + int *tl, *mtl, *ntl; +}; + +struct brg_int8_t { + int M, K, N; + const int m_blk = 8, n_blk = 4, k_blk = 8; + const int ld_block = 6, rd_block = 4, bd_block = 8; + int na, nb; + int m_tail, n_tail, k_tail; + int is_m_tail, is_k_tail, is_n_tail, is_zp_cal; + int dst_dt_sz; + bool is_s8; + bool is_bias; + bool with_scales; + bool with_dst_scales; + bool is_oc_scales; + jit_int8_broadcast_t zp_type_a = jit_int8_broadcast_t::none; + jit_int8_broadcast_t zp_type_b = jit_int8_broadcast_t::none; + jit_int8_broadcast_t zp_type_c = jit_int8_broadcast_t::none; + bool is_zp_b_int8 = false; + bool b_reo = true; + data_type_t zp_b_dt; + dim_t B; +}; + +struct call_params_t { + const uint8_t *src, *wei; + float *dst; + const float *bias, *scales, *dst_scales; + dim_t M, K, N; + char *buf_B_ptr_; + int *na, *nb; + int32_t *src_zero_point, *wei_zero_point, *dst_zero_point; + const int8_t *wei_zero_point_buf; + float *zp_a_ptr, *zp_b_ptr; +}; + +} // namespace matmul +} // namespace aarch64 +} // namespace cpu +} // namespace impl +} // namespace dnnl +#endif diff --git a/src/cpu/aarch64/matmul/jit_int8_matmul.cpp b/src/cpu/aarch64/matmul/jit_int8_matmul.cpp new file mode 100644 index 00000000000..97e3a17b45a --- /dev/null +++ b/src/cpu/aarch64/matmul/jit_int8_matmul.cpp @@ -0,0 +1,1478 @@ +/******************************************************************************* +* Copyright 2025 FUJITSU LIMITED +* +* Licensed under the Apache License, Version 2.0 (the "License"); 
+* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "common/c_types_map.hpp" +#include "common/dnnl_thread.hpp" +#include "common/math_utils.hpp" +#include "common/memory_tracking.hpp" +#include "common/nstl.hpp" +#include "common/tag_traits.hpp" +#include "common/type_helpers.hpp" +#include "common/utils.hpp" +#include "cpu/aarch64/jit_generator.hpp" +#include "cpu/cpu_primitive.hpp" +#include "cpu/matmul/matmul_utils.hpp" +#include "cpu/scale_utils.hpp" + +#include "cpu/platform.hpp" +#include "cpu/primitive_attr_postops.hpp" + +#include "cpu/aarch64/matmul/jit_int8_kernel_types.hpp" +#include "cpu/aarch64/matmul/jit_int8_matmul.hpp" +#include "cpu/aarch64/matmul/jit_int8_matmul_utils.hpp" + +#define GET_OFF(field) (uint32_t) offsetof(call_params_t, field) + +#define LDR_IMM(reg, addr, off) \ + { \ + const uint64_t IMM12_MASK = ~uint64_t(0xfff); \ + if ((off & IMM12_MASK) == 0) { \ + ldr(reg, ptr(addr, off)); \ + } else { \ + add_imm(X_DEFAULT_ADDR, addr, off, X_TMP_0); \ + ldr(reg, ptr(X_DEFAULT_ADDR)); \ + } \ + } + +#define VCHECK_BG(f, msg, ...) \ + VCHECK(primitive, create, dispatch, brgemm_matmul, f, msg, ##__VA_ARGS__); + +namespace dnnl { +namespace impl { +namespace cpu { +namespace aarch64 { +namespace matmul { + +using namespace Xbyak_aarch64; +using namespace dnnl::impl::cpu::matmul; +using namespace dnnl::impl::format_tag; +using namespace dnnl::impl::memory_tracking::names; +using namespace dnnl::impl::utils; + +using namespace nstl; + +using namespace data_type; + +struct jit_int8_matmul_kernel_t : public jit_generator { + + DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_int8_matmul_kernel_t) + + XReg reg_param = abi_param1; + XReg reg_a = x3; + XReg reg_b = x4; + XReg reg_c = x5; + XReg reg_aux_a = x6; + XReg reg_aux_b = x7; + XReg reg_aux_c = x8; + XReg reg_aux_a1 = x9; + XReg reg_zp_aux_b_buf = x10; + XReg reg_aux_c1 = x11; + XReg reg_ld_loop = x12; + XReg reg_rd_loop = x13; + XReg reg_bd_loop = x14; + XReg reg_tmp = x15; + XReg reg_tmp_1 = x16; + XReg reg_bias = x17; + XReg reg_zp_a = x18; + + XReg reg_scales = x20; + XReg reg_aux_scales = x24; //used X_TMP_1 + XReg reg_na = x25; //used X_TMP_2 + XReg reg_zp_b = x26; //used X_TMP_3 + XReg reg_zp_aux_b = x27; //used X_TMP_4 + PReg prd_ld = p1; + PReg prd_st = p2; + PReg prd_b = p3; + PReg prd_8 = p4; + PReg prd_zp_b_tl = p5; + XReg reg_zp_val_c = x2; + + XReg reg_zp_val_a = reg_scales; + XReg reg_zp_val_b = reg_aux_scales; + + call_params_t inp; + + void operator()(const call_params_t *p) { + return jit_generator::operator()(p); + } + + ZReg loadb(int ld) { return ZReg(ld + 1); } + ZReg acc(int bd, int ld) { + return ZReg(bd * brg_.ld_block + ld + brg_.ld_block + 1); + } + void zero_regs() { + for (int a = 0; a < brg_.bd_block / 2; a++) + for (int b = 0; b < brg_.ld_block; b++) + eor(acc(a, b).d, acc(a, b).d, acc(a, b).d); + } + void store_regs(int bdb, int ldb, int tail) { + for (int a = 0; a < bdb; 
a++) { + for (int b = 0; b < ldb; b++) { + if (brg_.is_s8) + scvtf(acc(a, b).s, P_ALL_ONE, acc(a, b).s); + else + ucvtf(acc(a, b).s, P_ALL_ONE, acc(a, b).s); + } + } + + for (int a = 0; a < bdb; a++) { + for (int b = 0; b < ldb; b += 2) { + if (b + 1 < ldb) { + uzp1(z31.d, acc(a, b).d, acc(a, b + 1).d); + uzp2(acc(a, b + 1).d, acc(a, b).d, acc(a, b + 1).d); + mov(acc(a, b).d, z31.d); + } else { + uzp1(z31.d, acc(a, b).d, acc(a, b).d); + uzp2(acc(a, b + 1).d, acc(a, b).d, acc(a, b).d); + mov(acc(a, b).d, z31.d); + } + } + } + + if (brg_.zp_type_a != jit_int8_broadcast_t::none) { + for (int b = 0; b < ldb; b += 2) { + PReg p = (brg_.is_n_tail && b >= ldb - 2) ? prd_b : P_ALL_ONE; + ld1w(z31.s, p, ptr(reg_zp_a, b / 2, MUL_VL)); + for (int a = 0; a < bdb; a++) { + fsub(acc(a, b).s, acc(a, b).s, z31.s); + fsub(acc(a, b + 1).s, acc(a, b + 1).s, z31.s); + } + } + } + + if (brg_.zp_type_b != jit_int8_broadcast_t::none) { + int ao = 0; + if (brg_.is_zp_b_int8) { + mov(reg_tmp_1, reg_zp_aux_b_buf); + int ilp = (brg_.is_n_tail) ? n_blks : 3; + for (int i = 0; i < ilp; i++) { + PReg p = (brg_.is_n_tail && i == ilp - 1) ? prd_zp_b_tl + : prd_8; + ld1b(ZRegB(i + 1), p, ptr(reg_tmp_1)); + if (brg_.zp_b_dt == u8) { + uunpklo(ZRegH(i + 1), ZRegB(i + 1)); + uunpklo(ZRegS(i + 1), ZRegH(i + 1)); + ucvtf(ZRegS(i + 1), P_ALL_ONE, ZRegS(i + 1)); + } else { + sunpklo(ZRegH(i + 1), ZRegB(i + 1)); + sunpklo(ZRegS(i + 1), ZRegH(i + 1)); + scvtf(ZRegS(i + 1), P_ALL_ONE, ZRegS(i + 1)); + } + add_imm(reg_tmp_1, reg_tmp_1, 8, X_TMP_0); + } + } + for (int a = 0; a < bdb; a++) { + ld1rw(z31.s, P_ALL_ONE, ptr(reg_zp_aux_b, ao * 4)); + ld1rw(z0.s, P_ALL_ONE, ptr(reg_zp_aux_b, (ao + 1) * 4)); + for (int b = 0; b < ldb; b += 2) { + if (brg_.is_zp_b_int8) { + fmul(z4.s, z31.s, ZRegS(b / 2 + 1)); + fmul(z5.s, z0.s, ZRegS(b / 2 + 1)); + fsub(acc(a, b).s, acc(a, b).s, z4.s); + fsub(acc(a, b + 1).s, acc(a, b + 1).s, z5.s); + } else { + fsub(acc(a, b).s, acc(a, b).s, z31.s); + fsub(acc(a, b + 1).s, acc(a, b + 1).s, z0.s); + } + } + ao += 2; + } + } + + if (brg_.with_scales) { + for (int b = 0; b < ldb; b += 2) { + PReg p = (brg_.is_n_tail && b >= ldb - 2) ? prd_b : P_ALL_ONE; + if (brg_.is_oc_scales) { + ld1w(z31.s, p, ptr(reg_scales, b / 2, MUL_VL)); + } else { + ld1w(z31.s, p, ptr(reg_scales)); + } + + for (int a = 0; a < bdb; a++) { + fmul(acc(a, b).s, acc(a, b).s, z31.s); + fmul(acc(a, b + 1).s, acc(a, b + 1).s, z31.s); + } + } + } + + if (brg_.is_bias) { + for (int b = 0; b < ldb; b += 2) { + PReg p = (brg_.is_n_tail && b >= ldb - 2) ? 
prd_b : P_ALL_ONE; + ld1w(z31.s, p, ptr(reg_bias, b / 2, MUL_VL)); + for (int a = 0; a < bdb; a++) { + fadd(acc(a, b).s, acc(a, b).s, z31.s); + fadd(acc(a, b + 1).s, acc(a, b + 1).s, z31.s); + } + } + } + + if (brg_.with_dst_scales) { + ld1rw(z31.s, P_ALL_ONE, ptr(reg_aux_scales)); + for (int b = 0; b < ldb; b += 2) { + for (int a = 0; a < bdb; a++) { + fmul(acc(a, b).s, acc(a, b).s, z31.s); + fmul(acc(a, b + 1).s, acc(a, b + 1).s, z31.s); + } + } + } + + if (brg_.zp_type_c != jit_int8_broadcast_t::none) { + LDR_IMM(reg_zp_val_c, reg_param, GET_OFF(dst_zero_point)); + ldr(W_TMP_0, ptr(reg_zp_val_c)); + dup(z0.s, W_TMP_0); + scvtf(z0.s, P_ALL_ONE, z0.s); + for (int b = 0; b < ldb; b += 2) { + for (int a = 0; a < bdb; a++) { + fadd(acc(a, b).s, acc(a, b).s, z0.s); + fadd(acc(a, b + 1).s, acc(a, b + 1).s, z0.s); + } + } + } + + mov(reg_tmp, reg_aux_c); + add_imm(reg_tmp_1, reg_aux_c, brg_.N * brg_.dst_dt_sz, X_TMP_0); + for (int a = 0; a < bdb; a++) { + for (int b = 0; b < ldb; b += 2) { + PReg p = (brg_.is_n_tail && b >= ldb - 2) ? prd_st : P_ALL_ONE; + int vl = b / 2; + st1w(acc(a, b).s, p, ptr(reg_tmp, vl, MUL_VL)); + if (a >= bdb - 1 && brg_.is_m_tail) { + if (brg_.m_tail % 2 == 0) + st1w(acc(a, b + 1).s, p, ptr(reg_tmp_1, vl, MUL_VL)); + } else { + st1w(acc(a, b + 1).s, p, ptr(reg_tmp_1, vl, MUL_VL)); + } + } + add_imm(reg_tmp, reg_tmp, 2 * brg_.N * brg_.dst_dt_sz, X_TMP_0); + add_imm(reg_tmp_1, reg_tmp_1, 2 * brg_.N * brg_.dst_dt_sz, X_TMP_0); + } + } + + void microkernel(int rdb, int bdb, int ldb, int tail) { + int a_off = 0, rd, ld, bd; + mov(reg_tmp, reg_aux_b); + for (rd = 0; rd < rdb; rd++) { + int ao = 0; + + for (ld = 0; ld < ldb; ld++) { + PReg p = (brg_.is_n_tail && ld == ldb - 1) ? prd_ld : P_ALL_ONE; + ld1b(loadb(ld).b, p, ptr(reg_tmp, ld, MUL_VL)); + } + for (bd = 0; bd < bdb; bd++) { + add_imm(X_DEFAULT_ADDR, reg_aux_a, a_off + ao, X_TMP_0); + ld1rqb(z0.b, P_ALL_ONE, ptr(X_DEFAULT_ADDR)); + ao += brg_.m_blk * 2; + + for (ld = 0; ld < ldb; ld++) { + if (brg_.is_s8) + smmla(acc(bd, ld).s, z0.b, loadb(ld).b); + else + ummla(acc(bd, ld).s, z0.b, loadb(ld).b); + } + } + a_off += brg_.m_blk * brg_.k_blk; + add_imm(reg_tmp, reg_tmp, brg_.k_blk * brg_.n_blk * brg_.ld_block, + X_TMP_0); + } + } + + void loop_k(int bdb, int ldb, int tail) { + zero_regs(); + mov(reg_aux_a, reg_aux_a1); + mov(reg_aux_b, reg_b); + if (k_full_blks > 0) { + mov(reg_rd_loop, k_full_blks); + Label l0; + L(l0); + microkernel(brg_.rd_block, bdb, ldb, tail); + add_imm(reg_aux_a, reg_aux_a, + brg_.m_blk * brg_.k_blk * brg_.rd_block, X_TMP_0); + add_imm(reg_aux_b, reg_aux_b, + brg_.k_blk * brg_.n_blk * brg_.ld_block * brg_.rd_block, + X_TMP_0); + sub(reg_rd_loop, reg_rd_loop, 1); + cmp(reg_rd_loop, 0); + b(GT, l0); + } + if (k_tail_blk > 0) { + microkernel(k_tail_blk, bdb, ldb, tail); + add_imm(reg_aux_a, reg_aux_a, brg_.m_blk * brg_.k_blk * k_tail_blk, + X_TMP_0); + add_imm(reg_aux_b, reg_aux_b, + brg_.k_blk * brg_.n_blk * brg_.ld_block * k_tail_blk, + X_TMP_0); + } + if (k_residual_blk > 0) { microkernel(1, bdb, ldb, tail); } + store_regs(bdb, ldb, tail); + } + + void loop_k_zp(int bdb, int ldb, int is_a, int is_b) { + eor(z3.d, z3.d, z3.d); + eor(z4.d, z4.d, z4.d); + for (int i = 0; i < 6; i++) + eor(acc(2, i).d, acc(2, i).d, acc(2, i).d); + mov(reg_aux_a, reg_aux_a1); + mov(reg_aux_b, reg_b); + if (k_full_blks > 0) { + mov(reg_rd_loop, k_full_blks); + Label l0; + L(l0); + zp_comp(brg_.rd_block, bdb, ldb, is_a, is_b); + add_imm(reg_aux_a, reg_aux_a, + brg_.m_blk * brg_.k_blk * brg_.rd_block, X_TMP_0); + 
add_imm(reg_aux_b, reg_aux_b, + brg_.k_blk * brg_.n_blk * brg_.ld_block * brg_.rd_block, + X_TMP_0); + sub(reg_rd_loop, reg_rd_loop, 1); + cmp(reg_rd_loop, 0); + b(GT, l0); + } + if (k_tail_blk > 0) { + zp_comp(k_tail_blk, bdb, ldb, is_a, is_b); + add_imm(reg_aux_a, reg_aux_a, brg_.m_blk * brg_.k_blk * k_tail_blk, + X_TMP_0); + add_imm(reg_aux_b, reg_aux_b, + brg_.k_blk * brg_.n_blk * brg_.ld_block * k_tail_blk, + X_TMP_0); + } + if (k_residual_blk > 0) { zp_comp(1, bdb, ldb, is_a, is_b); } + + if (brg_.zp_type_b != jit_int8_broadcast_t::none && is_b == 1) { + uzp1(z3.d, z3.d, z4.d); + scvtf(z3.s, P_ALL_ONE, z3.s); + if (!brg_.is_zp_b_int8) { + ldr(W_TMP_0, ptr(reg_zp_val_b)); + dup(z0.s, W_TMP_0); + scvtf(z0.s, P_ALL_ONE, z0.s); + fmul(z3.s, P_ALL_ONE, z0.s); + } else { + if (brg_.zp_type_a != jit_int8_broadcast_t::none) { + ldr(W_TMP_0, ptr(reg_zp_val_a)); + dup(z0.s, W_TMP_0); + mov_imm(W_TMP_0, brg_.K); + dup(z1.s, W_TMP_0); + scvtf(z0.s, P_ALL_ONE, z0.s); + scvtf(z1.s, P_ALL_ONE, z1.s); + fmul(z0.s, z1.s, z0.s); + fsub(z3.s, z3.s, z0.s); + } + } + st1w(z3.s, P_ALL_ONE, ptr(reg_zp_b)); + } + + if ((brg_.zp_type_a != jit_int8_broadcast_t::none) && is_a == 1) { + ldr(W_TMP_0, ptr(reg_zp_val_a)); + dup(z2.s, W_TMP_0); + scvtf(z2.s, P_ALL_ONE, z2.s); + uzp1(acc(2, 0).d, acc(2, 0).d, acc(2, 1).d); + uzp1(acc(2, 2).d, acc(2, 2).d, acc(2, 3).d); + uzp1(acc(2, 4).d, acc(2, 4).d, acc(2, 5).d); + + scvtf(acc(2, 0).s, P_ALL_ONE, acc(2, 0).s); + scvtf(acc(2, 2).s, P_ALL_ONE, acc(2, 2).s); + scvtf(acc(2, 4).s, P_ALL_ONE, acc(2, 4).s); + if (brg_.zp_type_b != jit_int8_broadcast_t::none + && !brg_.is_zp_b_int8) { + ldr(W_TMP_0, ptr(reg_zp_val_b)); + dup(z0.s, W_TMP_0); + mov_imm(W_TMP_0, brg_.K); + dup(z1.s, W_TMP_0); + scvtf(z0.s, P_ALL_ONE, z0.s); + scvtf(z1.s, P_ALL_ONE, z1.s); + fmul(z0.s, z1.s, z0.s); + fsub(acc(2, 0).s, acc(2, 0).s, z0.s); + fsub(acc(2, 2).s, acc(2, 2).s, z0.s); + fsub(acc(2, 4).s, acc(2, 4).s, z0.s); + } + fmul(acc(2, 0).s, P_ALL_ONE, z2.s); + fmul(acc(2, 2).s, P_ALL_ONE, z2.s); + fmul(acc(2, 4).s, P_ALL_ONE, z2.s); + + st1w(acc(2, 0).s, P_ALL_ONE, ptr(reg_zp_a)); + st1w(acc(2, 2).s, P_ALL_ONE, ptr(reg_zp_a, 1, MUL_VL)); + st1w(acc(2, 4).s, P_ALL_ONE, ptr(reg_zp_a, 2, MUL_VL)); + } + } + + void han_blk() { + Label ld_loop, bd_loop; + LDR_IMM(reg_tmp, reg_param, GET_OFF(nb)); + LDR_IMM(reg_na, reg_param, GET_OFF(na)); + ldr(WReg(reg_ld_loop.getIdx()), ptr(reg_tmp)); + mov(reg_aux_a1, reg_a); + // mov(reg_b,reg_b); + mov(reg_aux_c1, reg_c); + mov(reg_aux_c, reg_aux_c1); + mov(reg_zp_aux_b, reg_zp_b); + L(ld_loop); + ldr(WReg(reg_bd_loop.getIdx()), ptr(reg_na)); + L(bd_loop); + loop_k(bdb, ldb, 0); + add_imm(reg_aux_a1, reg_aux_a1, + div_up(brg_.K, brg_.k_blk) * brg_.k_blk * brg_.bd_block, + X_TMP_0); + add_imm(reg_aux_c, reg_aux_c, brg_.N * brg_.bd_block * brg_.dst_dt_sz, + X_TMP_0); + add_imm(reg_zp_aux_b, reg_zp_aux_b, brg_.m_blk * brg_.dst_dt_sz, + X_TMP_0); + sub(reg_bd_loop, reg_bd_loop, 1); + cmp(reg_bd_loop, 0); + b(GT, bd_loop); + mov(reg_aux_a1, reg_a); + mov(reg_zp_aux_b, reg_zp_b); + add_imm(reg_b, reg_b, + (brg_.n_blk * brg_.ld_block) * div_up(brg_.K, brg_.k_blk) + * brg_.k_blk, + X_TMP_0); + add_imm(reg_aux_c1, reg_aux_c1, + brg_.dst_dt_sz * (brg_.n_blk * brg_.ld_block), X_TMP_0); + add_imm(reg_zp_a, reg_zp_a, brg_.n_blk * brg_.ld_block * brg_.dst_dt_sz, + X_TMP_0); + if (brg_.is_oc_scales) + add_imm(reg_scales, reg_scales, + brg_.dst_dt_sz * (brg_.n_blk * brg_.ld_block), X_TMP_0); + add_imm(reg_bias, reg_bias, + brg_.dst_dt_sz * (brg_.n_blk * brg_.ld_block), 
X_TMP_0); + mov(reg_aux_c, reg_aux_c1); + sub(reg_ld_loop, reg_ld_loop, 1); + cmp(reg_ld_loop, 0); + b(GT, ld_loop); + } + + void han_blk_zp() { + Label ld_loop, bd_loop, skip_ld_loop, skip_bd_loop; + LDR_IMM(reg_tmp, reg_param, GET_OFF(nb)); + LDR_IMM(reg_na, reg_param, GET_OFF(na)); + ldr(WReg(reg_ld_loop.getIdx()), ptr(reg_tmp)); + ldr(WReg(reg_bd_loop.getIdx()), ptr(reg_na)); + mov(reg_aux_a1, reg_a); + // mov(reg_b,reg_b); + if (brg_.zp_type_b != jit_int8_broadcast_t::none) { + cmp(reg_bd_loop, 0); + b(EQ, skip_bd_loop); + L(bd_loop); + loop_k_zp(bdb, ldb, 0, 1); + add_imm(reg_aux_a1, reg_aux_a1, + div_up(brg_.K, brg_.k_blk) * brg_.k_blk * brg_.bd_block, + X_TMP_0); + add_imm(reg_zp_b, reg_zp_b, brg_.m_blk * brg_.dst_dt_sz, X_TMP_0); + sub(reg_bd_loop, reg_bd_loop, 1); + cmp(reg_bd_loop, 0); + b(GT, bd_loop); + L(skip_bd_loop); + } + if (brg_.zp_type_a != jit_int8_broadcast_t::none) { + cmp(reg_ld_loop, 0); + b(EQ, skip_ld_loop); + L(ld_loop); + loop_k_zp(bdb, ldb, 1, 0); + add_imm(reg_zp_a, reg_zp_a, + brg_.n_blk * brg_.ld_block * brg_.dst_dt_sz, X_TMP_0); + add_imm(reg_b, reg_b, + (brg_.n_blk * brg_.ld_block) * div_up(brg_.K, brg_.k_blk) + * brg_.k_blk, + X_TMP_0); + sub(reg_ld_loop, reg_ld_loop, 1); + cmp(reg_ld_loop, 0); + b(GT, ld_loop); + L(skip_ld_loop); + } + } + + void zp_comp(int rdb, int bdb, int ldb, int is_a, int is_b) { + + dup(z0.b, 1); + int rd, ld; + if (brg_.zp_type_b != jit_int8_broadcast_t::none && is_b == 1) { + mov(reg_tmp, reg_aux_a); + for (rd = 0; rd < rdb; rd++) { + ld1b(z1.b, P_ALL_ONE / T_z, ptr(reg_tmp)); + ld1b(z2.b, P_ALL_ONE / T_z, ptr(reg_tmp, 1, MUL_VL)); + add_imm(reg_tmp, reg_tmp, brg_.k_blk * brg_.m_blk, X_TMP_0); + if (brg_.is_s8) { + smmla(z3.s, z0.b, z1.b); + smmla(z4.s, z0.b, z2.b); + } else { + ummla(z3.s, z0.b, z1.b); + ummla(z4.s, z0.b, z2.b); + } + } + } + if ((brg_.zp_type_a != jit_int8_broadcast_t::none) && is_a == 1) { + mov(reg_tmp, reg_aux_b); + + for (rd = 0; rd < rdb; rd++) { + for (ld = 0; ld < ldb; ld++) { + PReg p = (brg_.is_n_tail && ld == ldb - 1) ? prd_ld + : P_ALL_ONE; + ld1b(acc(1, ld).b, p, ptr(reg_tmp, ld, MUL_VL)); + } + add_imm(reg_tmp, reg_tmp, + brg_.k_blk * brg_.n_blk * brg_.ld_block, X_TMP_0); + for (ld = 0; ld < ldb; ld++) { + if (brg_.is_s8) { + smmla(acc(2, ld).s, z0.b, acc(1, ld).b); + } else { + ummla(acc(2, ld).s, z0.b, acc(1, ld).b); + } + } + } + } + } + + void config() { + int m, pred_st = 0, pred_ld = 0, sv_len = 8, pred_b = 8; + n_blks = div_up(brg_.n_tail, 8); + k_full_blks = brg_.K / (brg_.k_blk * brg_.rd_block); + m = brg_.K % (brg_.k_blk * brg_.rd_block); + k_tail_blk = m / brg_.k_blk; + k_residual_blk = m % brg_.k_blk; + ldb = (brg_.is_n_tail) ? div_up(brg_.n_tail, 4) : brg_.ld_block; + bdb = (brg_.is_m_tail) ? div_up(brg_.m_tail, 2) : brg_.bd_block / 2; + rdb = (brg_.is_k_tail) ? div_up(brg_.k_tail, brg_.k_blk) : 4; + + int pred_zp_b_tl = (brg_.n_tail % 8 == 0) ? 8 : brg_.n_tail % 8; + set_preg(prd_8.b, 8, X_TMP_0, X_TMP_1); + set_preg(prd_zp_b_tl.b, pred_zp_b_tl, X_TMP_0, X_TMP_1); + + if (brg_.is_n_tail) { + pred_b = (brg_.n_tail % 8 == 0) ? sv_len : (brg_.n_tail % 8); + if (brg_.n_tail % brg_.n_blk == 0) { + pred_st = (brg_.n_tail % (brg_.n_blk * 2) == 0) ? sv_len + : sv_len / 2; + pred_ld = sv_len * brg_.dst_dt_sz; + } else { + pred_ld = (brg_.n_tail % brg_.n_blk) * brg_.k_blk; + pred_st = (ldb % 2 == 0) + ? 
(sv_len / 2) + (brg_.n_tail % brg_.n_blk) + : (brg_.n_tail % brg_.n_blk); + } + } + set_preg(prd_ld.b, pred_ld, X_TMP_0, X_TMP_1); + set_preg(prd_st.s, pred_st, X_TMP_0, X_TMP_1); + set_preg(prd_b.s, pred_b, X_TMP_0, X_TMP_1); + } + + void generate() override { + preamble(); + config(); + + LDR_IMM(reg_a, reg_param, GET_OFF(src)); + LDR_IMM(reg_b, reg_param, GET_OFF(wei)); + LDR_IMM(reg_c, reg_param, GET_OFF(dst)); + LDR_IMM(reg_zp_b, reg_param, GET_OFF(zp_b_ptr)); + LDR_IMM(reg_zp_a, reg_param, GET_OFF(zp_a_ptr)); + if (brg_.is_zp_cal) { + LDR_IMM(reg_zp_val_b, reg_param, GET_OFF(wei_zero_point)); + LDR_IMM(reg_zp_val_a, reg_param, GET_OFF(src_zero_point)); + han_blk_zp(); + } else { + + LDR_IMM(reg_bias, reg_param, GET_OFF(bias)); + LDR_IMM(reg_scales, reg_param, GET_OFF(scales)); + LDR_IMM(reg_aux_scales, reg_param, GET_OFF(dst_scales)); + LDR_IMM(reg_zp_aux_b_buf, reg_param, GET_OFF(wei_zero_point_buf)); + han_blk(); + } + + postamble(); + } + + jit_int8_matmul_kernel_t(const brg_int8_t &k) : brg_(k) {} + ~jit_int8_matmul_kernel_t() override = default; + +private: + brg_int8_t brg_; + int ldb; + int bdb; + int rdb; + int k_full_blks; + int k_tail_blk; + int k_residual_blk; + int n_blks; +}; + +status_t jit_int8_matmul_t::pd_t::init(engine_t *engine) { + + const auto src_type = src_md(0)->data_type; + const auto wei_type = weights_md(0)->data_type; + const auto dst_type = dst_md(0)->data_type; + + const memory_desc_wrapper src_d(src_md_); + const memory_desc_wrapper weights_d(weights_md_); + const memory_desc_wrapper dst_d(dst_md_); + const memory_desc_wrapper bias_d(bias_md_); + + const bool no_runtime_dims_or_strides + = !(src_d.has_runtime_dims_or_strides() + || weights_d.has_runtime_dims_or_strides()); + + VDISPATCH_MATMUL( + no_runtime_dims_or_strides, VERBOSE_RUNTIMEDIM_UNSUPPORTED); + + bool is_s8_wei = utils::everyone_is(s8, wei_type); + bool is_u8 = utils::everyone_is(u8, src_type, wei_type); + bool is_s8 = utils::everyone_is(s8, src_type, wei_type); + + int dims = src_d.ndims(); + + auto check_attr_scales = [&]() -> bool { + const std::vector supported_args + = {DNNL_ARG_SRC, DNNL_ARG_WEIGHTS, DNNL_ARG_DST}; + bool ok = attr_scales_ok(supported_args); + auto is_src_scl + = !attr()->scales_.get(DNNL_ARG_SRC).has_default_values(); + auto is_wei_scl + = !attr()->scales_.get(DNNL_ARG_WEIGHTS).has_default_values(); + auto dst_scl_msk = attr()->scales_.get(DNNL_ARG_DST).get_mask(); + auto wei_scl_msk = attr()->scales_.get(DNNL_ARG_WEIGHTS).get_mask(); + auto src_scl_msk = attr()->scales_.get(DNNL_ARG_SRC).get_mask(); + + if (src_scl_msk > 0 + || (wei_scl_msk > 0 && wei_scl_msk != 1 << (dims - 1)) + || dst_scl_msk > 0) + return false; + + if (is_src_scl && is_wei_scl && wei_scl_msk > 0) { + // This case requires scratchpad. 
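+            // Both src and per-channel weight scales are folded into a single
+            // precomputed buffer of N() elements (see book_precomputed_scales()
+            // below), so N must be known when the descriptor is created.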
+ if (N() == DNNL_RUNTIME_DIM_VAL) ok = false; + } + return ok; + }; + + auto check_bias = [&]() -> bool { + if (bias_d.format_any()) { + if (bias_d.has_runtime_dims_or_strides()) return false; + status_t status = memory_desc_init_by_strides(bias_md_, nullptr); + if (status != status::success) return false; + } + + const auto bia_dt = weights_md(1)->data_type; + return IMPLICATION(with_bias(), bia_dt == f32 && is_bias_1xN()); + }; + + auto init_zp_type = [&](brg_int8_t *brg_) -> bool { + auto zero_points = attr()->zero_points_; + + auto wt_int8 = zero_points.get_data_type(DNNL_ARG_WEIGHTS) == u8 + || zero_points.get_data_type(DNNL_ARG_WEIGHTS) == s8; + if (!zero_points.has_default_data_type(DNNL_ARG_SRC) + || !zero_points.has_default_data_type(DNNL_ARG_DST) + || (!zero_points.has_default_data_type(DNNL_ARG_WEIGHTS) + && !wt_int8)) + return false; + + if (!zero_points.has_default_data_type(DNNL_ARG_WEIGHTS)) { + switch (zero_points.get_data_type(DNNL_ARG_WEIGHTS)) { + case u8: { + brg_->zp_b_dt = u8; + brg_->is_zp_b_int8 = true; + break; + } + case s8: { + brg_->zp_b_dt = s8; + brg_->is_zp_b_int8 = true; + break; + } + case s32: { + brg_->is_zp_b_int8 = false; + break; + } + default: return false; + } + } + + if (zero_points.get_mask(DNNL_ARG_SRC) > 0 + || zero_points.get_mask(DNNL_ARG_DST) > 0 + || (zero_points.get_mask(DNNL_ARG_WEIGHTS) > 0 + && (zero_points.get_mask(DNNL_ARG_WEIGHTS)) + != (3 << (dims - 2)))) + return false; + + brg_->zp_type_a = zero_points.has_default_values(DNNL_ARG_SRC) + ? jit_int8_broadcast_t::none + : jit_int8_broadcast_t::per_tensor; + + brg_->zp_type_b = zero_points.has_default_values(DNNL_ARG_WEIGHTS) + ? jit_int8_broadcast_t::none + : jit_int8_broadcast_t::per_tensor; + + brg_->zp_type_c = zero_points.has_default_values(DNNL_ARG_DST) + ? 
jit_int8_broadcast_t::none + : jit_int8_broadcast_t::per_tensor; + + return true; + }; + + VDISPATCH_MATMUL(init_zp_type(&brg_), VERBOSE_UNSUPPORTED_ZP_CFG); + + VDISPATCH_MATMUL(check_bias(), VERBOSE_UNSUPPORTED_BIAS_CFG); + + VDISPATCH_MATMUL(check_attr_scales(), VERBOSE_UNSUPPORTED_SCALES_CFG); + + bool no_post_ops = attr()->post_ops_.has_default_values(); + const bool problem_dt_correct + = (is_s8 || is_u8) && utils::everyone_is(f32, dst_type); + + VDISPATCH_MATMUL(problem_dt_correct, VERBOSE_UNSUPPORTED_DT); + VDISPATCH_MATMUL(no_post_ops, VERBOSE_UNSUPPORTED_ATTR); + VDISPATCH_MATMUL(formats_ok(), VERBOSE_UNSUPPORTED_TAG); + VDISPATCH_MATMUL(get_sve_length() == 32, VERBOSE_UNSUPPORTED_ISA); + + auto is_src_any = src_d.format_kind() == format_kind::any; + auto is_dst_any = dst_d.format_kind() == format_kind::any; + + switch (dims) { + case 2: { + if (is_src_any) + VCHECK_BG(memory_desc_init_by_tag(src_md_, format_tag::ab), + VERBOSE_UNSUPPORTED_TAG); + if (is_dst_any) + VCHECK_BG(memory_desc_init_by_tag(dst_md_, format_tag::ab), + VERBOSE_UNSUPPORTED_TAG); + if (!weights_d.matches_tag(format_tag::ab)) { + brg_.b_reo = false; + VCHECK_BG(memory_desc_init_by_tag( + weights_md_, format_tag::BA24b8a), + VERBOSE_UNSUPPORTED_TAG); + } else { + VCHECK_BG(memory_desc_init_by_tag(weights_md_, format_tag::ab), + VERBOSE_UNSUPPORTED_TAG); + } + break; + } + case 3: { + if (is_src_any) + VCHECK_BG(memory_desc_init_by_tag(src_md_, format_tag::abc), + VERBOSE_UNSUPPORTED_TAG); + if (is_dst_any) + VCHECK_BG(memory_desc_init_by_tag(dst_md_, format_tag::abc), + VERBOSE_UNSUPPORTED_TAG); + if (!weights_d.matches_tag(format_tag::abc)) { + brg_.b_reo = false; + VCHECK_BG(memory_desc_init_by_tag( + weights_md_, format_tag::aCB24c8b), + VERBOSE_UNSUPPORTED_TAG); + } else { + VCHECK_BG(memory_desc_init_by_tag(weights_md_, format_tag::abc), + VERBOSE_UNSUPPORTED_TAG); + } + if (src_d.dims()[0] != weights_d.dims()[0]) + return status::unimplemented; + break; + } + case 4: { + if (is_src_any) + VCHECK_BG(memory_desc_init_by_tag(src_md_, format_tag::abcd), + VERBOSE_UNSUPPORTED_TAG); + if (is_dst_any) + VCHECK_BG(memory_desc_init_by_tag(dst_md_, format_tag::abcd), + VERBOSE_UNSUPPORTED_TAG); + if (!weights_d.matches_tag(format_tag::abcd)) { + brg_.b_reo = false; + VCHECK_BG(memory_desc_init_by_tag( + weights_md_, format_tag::abDC24d8c), + VERBOSE_UNSUPPORTED_TAG); + } else { + VCHECK_BG( + memory_desc_init_by_tag(weights_md_, format_tag::abcd), + VERBOSE_UNSUPPORTED_TAG); + } + if (src_d.dims()[0] != weights_d.dims()[0] + || src_d.dims()[1] != weights_d.dims()[1]) + return status::unimplemented; + break; + } + default: return status::unimplemented; + } + + bool is_scales = !attr()->scales_.get(DNNL_ARG_SRC).has_default_values() + || !attr()->scales_.get(DNNL_ARG_WEIGHTS).has_default_values(); + + bool is_dst_scales + = !attr()->scales_.get(DNNL_ARG_DST).has_default_values(); + + const auto &wei_scales = attr()->scales_.get(DNNL_ARG_WEIGHTS); + + matmul_helper_t helper(src_d, weights_d, dst_d); + brg_.K = helper.K(); + brg_.M = helper.M(); + brg_.N = helper.N(); + brg_.dst_dt_sz = 4; + brg_.na = 1; + brg_.nb = 1; + brg_.m_tail = brg_.M % brg_.m_blk; + brg_.k_tail = brg_.K % (brg_.k_blk * brg_.rd_block); + brg_.n_tail = brg_.N % (brg_.n_blk * brg_.ld_block); + brg_.is_s8 = is_s8_wei; + brg_.is_bias = with_bias(); + brg_.B = batch(); + brg_.with_scales = is_scales; + brg_.with_dst_scales = is_dst_scales; + brg_.is_oc_scales = wei_scales.get_mask() > 0; + dyn_.K = brg_.K; + dyn_.N = brg_.N; + dyn_.M = brg_.M; + 
dyn_.B = brg_.B; + dyn_.mtail = brg_.m_tail; + dyn_.m_blk = brg_.m_blk; + dyn_.k_blk = brg_.k_blk; + dyn_.n_blk = brg_.n_blk * brg_.ld_block; + dyn_.ntail = brg_.n_tail; + dyn_.ktail = dyn_.K % brg_.k_blk; + + auto scratchpad = scratchpad_registry().registrar(); + scratchpad.book(key_brgemm_primitive_zp_comp_a, + div_up(brg_.N, (brg_.n_blk * brg_.ld_block)) + * (brg_.n_blk * brg_.ld_block) * brg_.dst_dt_sz * brg_.B, + sizeof(char)); + scratchpad.book(key_brgemm_primitive_zp_comp_b, + div_up(brg_.M, brg_.m_blk) * brg_.m_blk * brg_.dst_dt_sz * brg_.B, + sizeof(char)); + scratchpad.book(key_brgemm_primitive_buffer_a, + brg_.B * div_up(brg_.M, brg_.m_blk) * div_up(brg_.K, brg_.k_blk) + * brg_.m_blk * brg_.k_blk, + sizeof(char)); + scratchpad.book(key_brgemm_primitive_buffer_b, brg_.B * brg_.M * brg_.K, + sizeof(char)); + if (brg_.b_reo) + scratchpad.book(key_gemm_blocked_b, + brg_.B * div_up(brg_.N, (brg_.n_blk * brg_.ld_block)) + * (brg_.n_blk * brg_.ld_block) + * div_up(brg_.K, brg_.k_blk) * brg_.k_blk, + sizeof(char)); + book_precomputed_scales(scratchpad, attr()->scales_, N()); + + return status::success; +} + +status_t jit_int8_matmul_t::init(engine_t *engine) { + + const auto &b1 = pd()->get_b(); + const auto &d1 = pd()->get_d(); + + dyn_vals_t d; + d.K = d1.K; + d.M = d1.M; + d.B = d1.B; + d.N = d1.N; + d.mtail = d1.mtail; + d.ktail = d1.ktail; + d.ntail = d1.ntail; + d.k_blk = d1.k_blk; + d.m_blk = d1.m_blk; + d.n_blk = d1.n_blk; + + brg_int8_t b; + b.M = b1.M; + b.K = b1.K; + b.N = b1.N; + b.na = b1.na; + b.nb = b1.nb; + b.m_tail = b1.m_tail; + b.n_tail = b1.n_tail; + b.k_tail = b1.k_tail; + b.dst_dt_sz = b1.dst_dt_sz; + b.is_s8 = b1.is_s8; + b.B = b1.B; + b.is_bias = b1.is_bias; + b.zp_type_a = b1.zp_type_a; + b.zp_type_b = b1.zp_type_b; + b.zp_type_c = b1.zp_type_c; + b.is_zp_b_int8 = b1.is_zp_b_int8; + b.zp_b_dt = b1.zp_b_dt; + b.with_scales = b1.with_scales; + b.with_dst_scales = b1.with_dst_scales; + b.is_oc_scales = b1.is_oc_scales; + b.b_reo = b1.b_reo; + + for (int z = 0; z < 2; z++) + for (int m = 0; m < 2; m++) + for (int n = 0; n < 2; n++) + for (int k = 0; k < 2; k++) { + int idx = pd()->get_idx(z, m, k, n, b1); + if (idx == -1 || idx > 15) continue; + b.is_m_tail = m; + b.is_k_tail = k; + b.is_n_tail = n; + b.is_zp_cal = z; + int8_kernels_[idx] + = std::unique_ptr { + new jit_int8_matmul_kernel_t(b)}; + if (!int8_kernels_[idx]) return status::runtime_error; + CHECK(int8_kernels_[idx]->create_kernel()); + } + + d.reorder_a = 1; + d.reorder_b = 0; + reo_ker_a_ = std::unique_ptr { + new jit_int8_matmul_utils_kernel_t(d)}; + CHECK(reo_ker_a_->create_kernel()); + + d.reorder_b = 1; + d.reorder_a = 0; + reo_ker_b_ = std::unique_ptr { + new jit_int8_matmul_utils_kernel_t(d)}; + CHECK(reo_ker_b_->create_kernel()); + + return status::success; +} + +jit_int8_matmul_t::jit_int8_matmul_t(const pd_t *apd) : primitive_t(apd) {} +jit_int8_matmul_t::~jit_int8_matmul_t() = default; + +status_t jit_int8_matmul_t::execute(const exec_ctx_t &ctx) const { + const auto *weights_b = CTX_IN_MEM(const float *, DNNL_ARG_WEIGHTS); + const auto *src_b = CTX_IN_MEM(const float *, DNNL_ARG_SRC); + auto dst = CTX_OUT_MEM(float *, DNNL_ARG_DST); + const auto *bias = CTX_IN_MEM(const float *, DNNL_ARG_BIAS); + + DEFINE_ZERO_POINT_VALUE(src_zero_point, DNNL_ARG_SRC); + DEFINE_ZERO_POINT_VALUE(wei_zero_point, DNNL_ARG_WEIGHTS); + DEFINE_ZERO_POINT_VALUE(dst_zero_point, DNNL_ARG_DST); + DEFINE_ZERO_POINTS_BUFFER(wei_zero_point_buf, DNNL_ARG_WEIGHTS); + DEFINE_ARG_SCALES_BUFFER(src_scales, DNNL_ARG_SRC); + 
DEFINE_ARG_SCALES_BUFFER(wei_scales, DNNL_ARG_WEIGHTS);
+    DEFINE_ARG_SCALES_BUFFER(dst_scales, DNNL_ARG_DST);
+
+    const auto &b = pd()->get_b();
+    const auto &d = pd()->get_d();
+
+    auto &scratchpad = ctx.get_scratchpad_grantor();
+
+    int num_threads = dnnl_get_current_num_threads();
+    char *src = scratchpad.template get<char>(key_brgemm_primitive_buffer_a);
+    char *weights = (b.b_reo)
+            ? scratchpad.template get<char>(key_gemm_blocked_b)
+            : (char *)weights_b;
+    char *zp_ptr_a
+            = scratchpad.template get<char>(key_brgemm_primitive_zp_comp_a);
+    char *zp_ptr_b
+            = scratchpad.template get<char>(key_brgemm_primitive_zp_comp_b);
+    const float *oscales = precompute_scales(
+            scratchpad, src_scales, wei_scales, pd()->N(), pd()->attr());
+
+    const dim_t B = b.B;
+    const dim_t M = b.M;
+    const dim_t N = b.N;
+    const dim_t K = b.K;
+
+    auto reorder_a = [&]() {
+        int m_blks = div_up(M, b.m_blk);
+        int k_blks = div_up(K, b.k_blk);
+        int n_blks = div_up(N, (b.n_blk * b.ld_block));
+        int parallel_work = B * m_blks * k_blks;
+        int parallel_work_mn = B * m_blks * n_blks;
+        int blk_per_bt = m_blks * k_blks;
+        int nt = std::min(num_threads, parallel_work);
+        nt = std::min(parallel_work_mn, nt);
+        auto tmp_src = src_b;
+
+        parallel(nt, [&](const int ithr, const int nthr) {
+            int start {0}, end {0};
+            balance211(parallel_work, nt, ithr, start, end);
+
+            int bt = start / blk_per_bt;
+            int bs = start % blk_per_bt;
+            int nobl = end - start;
+            int nobt = 1;
+            int noblf = end - start, nobll;
+
+            if (bs + nobl > blk_per_bt) {
+                nobt += div_up(nobl - (blk_per_bt - bs), blk_per_bt);
+                noblf = blk_per_bt - bs;
+                nobll = (nobl - (blk_per_bt - bs)) % blk_per_bt;
+                if (nobll == 0) nobll = blk_per_bt;
+            }
+            int nob;
+            for (int i = 0; i < nobt; i++) {
+                nob = (i == 0) ? noblf : ((i == nobt - 1) ? nobll : blk_per_bt);
+                bs = start % blk_per_bt;
+                int m_blk_src = bs / k_blks;
+                int k_blk_src = bs % k_blks;
+                int m_blk_dst = bs / k_blks;
+                int k_blk_dst = bs % k_blks;
+
+                int k1 = std::min(k_blks - k_blk_src, nob);
+                int k_tmp = nob - k1;
+                int m1 = (k_tmp > 0) ? k_tmp / k_blks : 0;
+                int k2 = (k_tmp > 0) ? k_tmp % k_blks : 0;
+                int src_ad = (bt * M * K) + (m_blk_src * b.m_blk * K)
+                        + (k_blk_src * b.k_blk);
+                int dst_ad = (bt * m_blks * k_blks * b.m_blk * b.k_blk)
+                        + (m_blk_dst * k_blks * b.m_blk * b.k_blk)
+                        + (k_blk_dst * b.m_blk * b.k_blk);
+                int src_new = src_ad, dst_new = dst_ad;
+
+                dyn_params_t k;
+
+                if (k1 > 0) {
+                    int a = 1;
+                    int mtl = (d.mtail > 0) ? 1 : 0;
+                    int tl = (d.ktail > 0) ? 1 : 0;
+                    if (k1 + k_blk_src < k_blks) tl = 0;
+                    if (1 + m_blk_src < m_blks) mtl = 0;
+                    k.src = (int8_t *)tmp_src + src_ad;
+                    k.dst = (int8_t *)src + dst_ad;
+                    k.nm = &a;
+                    k.nk = &k1;
+                    k.tl = &tl;
+                    k.mtl = &mtl;
+                    (*reo_ker_a_)(&k);
+                }
+
+                if (m1 > 0) {
+                    int mtl = (d.mtail > 0) ? 1 : 0;
+                    int tl = (d.ktail > 0) ? 1 : 0;
+                    if (1 + m1 + m_blk_src < m_blks) mtl = 0;
+                    if (k1 != k_blks) {
+                        src_new = src_ad - b.k_blk * (k_blks - k1)
+                                + b.m_blk * K;
+                    } else {
+                        src_new = src_ad + b.m_blk * K;
+                    }
+                    dst_new = dst_ad + b.m_blk * b.k_blk * k1;
+                    k.src = (int8_t *)tmp_src + src_new;
+                    k.dst = (int8_t *)src + dst_new;
+                    k.nm = &m1;
+                    k.nk = &k_blks;
+                    k.tl = &tl;
+                    k.mtl = &mtl;
+                    (*reo_ker_a_)(&k);
+                }
+                if (k2 > 0) {
+                    int a = 1, tl = 0;
+                    int mtl = (d.mtail > 0) ?
1 : 0; + if (1 + 1 + m1 + m_blk_src < m_blks) mtl = 0; + if (m1 < 1) { + src_new = src_ad - b.k_blk * (k_blks - k1) + + (b.m_blk * K); + dst_new = dst_ad + b.m_blk * b.k_blk * k1; + } else { + src_new += K * m1 * b.m_blk; + dst_new += b.m_blk * b.k_blk * k_blks * m1; + } + k.src = (int8_t *)tmp_src + src_new; + k.dst = (int8_t *)src + dst_new; + k.nm = &a; + k.nk = &k2; + k.tl = &tl; + k.mtl = &mtl; + (*reo_ker_a_)(&k); + } + bt++; + start += nob; + } + }); + }; + + auto reorder_b = [&]() { + int k_blks = div_up(K, d.k_blk); + int n_blks = div_up(N, d.n_blk); + int parallel_work = B * n_blks * k_blks; + int blk_per_bt = n_blks * k_blks; + int nt = std::min(num_threads, parallel_work); + + parallel(nt, [&](const int ithr, const int nthr) { + int start {0}, end {0}; + balance211(parallel_work, nt, ithr, start, end); + + int bt = start / blk_per_bt; + int bs = start % blk_per_bt; + int nobl = end - start; + int nobt = 1; + int noblf = end - start, nobll; + + if (bs + nobl > blk_per_bt) { + nobt += div_up(nobl - (blk_per_bt - bs), blk_per_bt); + noblf = blk_per_bt - bs; + nobll = (nobl - (blk_per_bt - bs)) % blk_per_bt; + if (nobll == 0) nobll = blk_per_bt; + } + int nob; + for (int i = 0; i < nobt; i++) { + nob = (i == 0) ? noblf : ((i == nobt - 1) ? nobll : blk_per_bt); + bs = start % blk_per_bt; + int n_blk_src = bs / k_blks; + int k_blk_src = bs % k_blks; + int n_blk_dst = bs / k_blks; + int k_blk_dst = bs % k_blks; + + int k1 = std::min(k_blks - k_blk_src, nob); + int k_tmp = nob - k1; + int n1 = (k_tmp > 0) ? k_tmp / k_blks : 0; + int k2 = (k_tmp > 0) ? k_tmp % k_blks : 0; + int src_ad = (bt * N * K) + (n_blk_src * d.n_blk) + + (k_blk_src * d.k_blk * N); + int dst_ad = (bt * n_blks * k_blks * d.k_blk * d.n_blk) + + (n_blk_dst * k_blks * d.k_blk * d.n_blk) + + (k_blk_dst * d.k_blk * d.n_blk); + int src_new = src_ad, dst_new = dst_ad; + + dyn_params_t k; + + if (k1 > 0) { + int a = 1; + int ntl = (d.ntail > 0) ? 1 : 0; + int tl = (d.ktail > 0) ? 1 : 0; + + if (k1 + k_blk_src < k_blks) tl = 0; + if (1 + n_blk_src < n_blks) ntl = 0; + k.src = (int8_t *)weights_b + src_ad; + k.dst = (int8_t *)weights + dst_ad; + k.nn = &a; + k.nk = &k1; + k.tl = &tl; + k.ntl = &ntl; + (*reo_ker_b_)(&k); + } + + if (n1 > 0) { + int ntl = (d.ntail > 0) ? 1 : 0; + int tl = (d.ktail > 0) ? 1 : 0; + if (1 + n1 + n_blk_src < n_blks) ntl = 0; + + if (k1 != k_blks) { + src_new = src_ad - d.k_blk * N * (k_blks - k1) + + d.n_blk; + } else { + src_new = src_ad + d.n_blk; + } + dst_new = dst_ad + d.k_blk * d.n_blk * k1; + k.src = (int8_t *)weights_b + src_new; + k.dst = (int8_t *)weights + dst_new; + k.nn = &n1; + k.nk = &k_blks; + k.tl = &tl; + k.ntl = &ntl; + (*reo_ker_b_)(&k); + } + if (k2 > 0) { + int a = 1, tl = 0; + int ntl = (d.ntail > 0) ? 
1 : 0; + if (1 + 1 + n1 + n_blk_src < n_blks) ntl = 0; + if (n1 < 1) { + src_new = src_ad - d.k_blk * N * (k_blks - k1) + + d.n_blk; + dst_new = dst_ad + d.k_blk * d.n_blk * k1; + } else { + src_new += n1 * d.n_blk; + dst_new += d.k_blk * d.n_blk * k_blks * n1; + } + k.src = (int8_t *)weights_b + src_new; + k.dst = (int8_t *)weights + dst_new; + k.nn = &a; + k.nk = &k2; + k.tl = &tl; + k.ntl = &ntl; + (*reo_ker_b_)(&k); + } + bt++; + start += nob; + } + }); + }; + + auto kernel_execute = [&](int idx, int na, int nb, int m_blk_adr, + int n_blk_adr, int dst_adr, int bias_addr, + int scl_addr, int zp_ptr_a_adr, + int zp_ptr_b_adr, int zp_b_buf) { + call_params_t p; + p.na = &na; + p.nb = &nb; + p.src = (uint8_t *)src + m_blk_adr; + p.wei = (uint8_t *)weights + n_blk_adr; + p.dst = dst + dst_adr; + p.bias = (float *)bias + bias_addr; + p.scales = oscales + scl_addr; + p.dst_scales = dst_scales; + p.src_zero_point = &src_zero_point; + if (b.is_zp_b_int8) + p.wei_zero_point_buf = (int8_t *)wei_zero_point_buf + zp_b_buf; + else + p.wei_zero_point = &wei_zero_point; + p.dst_zero_point = &dst_zero_point; + p.M = M; + p.N = N; + p.K = K; + p.zp_a_ptr = (float *)zp_ptr_a + zp_ptr_a_adr; + p.zp_b_ptr = (float *)zp_ptr_b + zp_ptr_b_adr; + (*int8_kernels_[idx])(&p); + }; + + auto kernel_execute_zp = [&]() { + int num_a_blocks = div_up(M, b.m_blk); + int num_b_blocks = div_up(N, (b.n_blk * b.ld_block)); + int ktail = (b.k_tail == 0) ? 0 : 1; + int parallel_work = B * num_a_blocks; + int nt = std::min(num_threads, parallel_work); + if (b.zp_type_b != jit_int8_broadcast_t::none) + parallel(nt, [&](const int ithr, const int nthr) { + int start {0}, end {0}; + balance211(parallel_work, nt, ithr, start, end); + int batch = start / num_a_blocks; + int m_st = start % num_a_blocks; + int m_ed = end - start + m_st; + int mtail + = (m_ed == num_a_blocks) ? ((b.m_tail > 0) ? 1 : 0) : 0; + int m_blk_adr = (batch + * (num_a_blocks * b.m_blk + * div_up(K, b.k_blk) * b.k_blk)) + + m_st * b.m_blk * div_up(K, b.k_blk) * b.k_blk; + int zp_ptr_b_adr + = (batch * (num_a_blocks * b.m_blk)) + m_st * b.m_blk; + + int idx = pd()->get_idx(1, 0, ktail, 0, b); + if (idx < 0) { + assert(!"Requested int8 matmul kernel was not created."); + return; + } + int n_a = m_ed - m_st; + if (mtail) n_a -= 1; + kernel_execute( + idx, n_a, 0, m_blk_adr, 0, 0, 0, 0, 0, zp_ptr_b_adr, 0); + + if (mtail) { + idx = pd()->get_idx(1, mtail, ktail, 0, b); + if (idx < 0) { + assert(!"Requested int8 matmul kernel was not " + "created."); + return; + } + m_blk_adr += n_a * b.m_blk * div_up(K, b.k_blk) * b.k_blk; + zp_ptr_b_adr += n_a * b.m_blk; + kernel_execute(idx, 1, 0, m_blk_adr, 0, 0, 0, 0, 0, + zp_ptr_b_adr, 0); + } + start++; + }); + + parallel_work = B * num_b_blocks; + nt = std::min(num_threads, parallel_work); + if (b.zp_type_a != jit_int8_broadcast_t::none) + parallel(nt, [&](const int ithr, const int nthr) { + int start {0}, end {0}; + balance211(parallel_work, nt, ithr, start, end); + int batch = start / num_b_blocks; + int n_st = start % num_b_blocks; + int n_ed = n_st + end - start; + int ntail + = (n_ed == num_b_blocks) ? ((b.n_tail > 0) ? 
1 : 0) : 0; + int n_blk_adr = (batch + * (num_b_blocks * (b.n_blk * b.ld_block) + * div_up(K, b.k_blk) * b.k_blk)) + + n_st * (b.n_blk * b.ld_block) * div_up(K, b.k_blk) + * b.k_blk; + int zp_ptr_a_adr + = (batch * num_b_blocks * (b.n_blk * b.ld_block)) + + n_st * (b.n_blk * b.ld_block); + + int idx = pd()->get_idx(1, 0, ktail, 0, b); + if (idx < 0) { + assert(!"Requested int8 matmul kernel was not created."); + return; + } + int n_b = n_ed - n_st; + if (ntail == 1) n_b -= 1; + + kernel_execute( + idx, 0, n_b, 0, n_blk_adr, 0, 0, 0, zp_ptr_a_adr, 0, 0); + + if (ntail) { + idx = pd()->get_idx(1, 0, ktail, 1, b); + if (idx < 0) { + assert(!"Requested int8 matmul kernel was not " + "created."); + return; + } + n_blk_adr += n_b * (b.n_blk * b.ld_block) + * div_up(K, b.k_blk) * b.k_blk; + zp_ptr_a_adr += n_b * (b.n_blk * b.ld_block); + kernel_execute(idx, 0, 1, 0, n_blk_adr, 0, 0, 0, + zp_ptr_a_adr, 0, 0); + } + + start++; + }); + }; + + if (b.b_reo) reorder_b(); + + reorder_a(); + + if (b.zp_type_a != jit_int8_broadcast_t::none + || b.zp_type_b != jit_int8_broadcast_t::none) + kernel_execute_zp(); + + int m_block_sz = 32; + int n_block_sz = 24; + int m_block1 = div_up(m_block_sz, b.m_blk); + int n_block1 = div_up(n_block_sz, (b.n_blk * b.ld_block)); + int m_block1_rs = div_up(M % m_block_sz, b.m_blk); + int n_block1_rs = div_up(N % n_block_sz, (b.n_blk * b.ld_block)); + + int num_a_blocks_act = div_up(M, b.m_blk); + int num_b_blocks_act = div_up(N, (b.n_blk * b.ld_block)); + int num_a_blocks = div_up(M, m_block_sz); + int num_b_blocks = div_up(N, n_block_sz); + int ktail = (b.k_tail == 0) ? 0 : 1; + int parallel_work = B * num_a_blocks * num_b_blocks; + int nt = std::min(num_threads, parallel_work); + + parallel(nt, [&](const int ithr, const int nthr) { + int start {0}, end {0}; + balance211(parallel_work, nt, ithr, start, end); + while (start < end) { + int batch = start / (num_a_blocks * num_b_blocks); + int batch_start = start % (num_a_blocks * num_b_blocks); + int m_block = batch_start % num_a_blocks; + int n_block = batch_start / num_a_blocks; + int mtail + = (m_block1_rs != 0 && m_block == num_a_blocks - 1) ? 1 : 0; + int ntail + = (n_block1_rs != 0 && n_block == num_b_blocks - 1) ? 1 : 0; + int dst_adr = (batch * M * N) + m_block * b.m_blk * m_block1 * N + + n_block * (b.n_blk * b.ld_block) * n_block1; + int m_blk_adr = (batch + * (num_a_blocks_act * b.m_blk + * div_up(K, b.k_blk) * b.k_blk)) + + m_block * b.m_blk * m_block1 * div_up(K, b.k_blk) + * b.k_blk; + int n_blk_adr = (batch + * (num_b_blocks_act * (b.n_blk * b.ld_block) + * div_up(K, b.k_blk) * b.k_blk)) + + n_block * (b.n_blk * b.ld_block) * n_block1 + * div_up(K, b.k_blk) * b.k_blk; + int zp_ptr_a_adr + = (batch * (num_b_blocks_act * (b.n_blk * b.ld_block))) + + n_block * (b.n_blk * b.ld_block) * n_block1; + int zp_ptr_b_adr = (batch * (num_a_blocks_act * b.m_blk)) + + m_block * b.m_blk * m_block1; + int bias_addr = n_block * (b.n_blk * b.ld_block) * n_block1; + int zp_b_buf = n_block * (b.n_blk * b.ld_block) * n_block1; + int scl_addr = (b.is_oc_scales) + ? (n_block * (b.n_blk * b.ld_block) * n_block1) + : 0; + int idx = pd()->get_idx(0, 0, ktail, 0, b); + if (idx < 0) { + assert(!"Requested int8 matmul kernel was not created."); + return; + } + int n_a = m_block1, n_b = n_block1; + n_a = (mtail) ? ((b.m_tail) ? m_block1_rs - 1 : m_block1_rs) + : m_block1; + n_b = (ntail) ? ((b.n_tail) ? 
n_block1_rs - 1 : n_block1_rs) + : n_block1; + + if (n_a > 0 && n_b > 0) { + + kernel_execute(idx, n_a, n_b, m_blk_adr, n_blk_adr, dst_adr, + bias_addr, scl_addr, zp_ptr_a_adr, zp_ptr_b_adr, + zp_b_buf); + } + + if (mtail && b.m_tail > 0 && n_b > 0) { + int new_dst_adr = dst_adr + b.m_blk * n_a * N; + int new_m_blk_adr = m_blk_adr + + b.m_blk * n_a * div_up(K, b.k_blk) * b.k_blk; + int new_zp_ptr_b_adr = zp_ptr_b_adr + b.m_blk * n_a; + idx = pd()->get_idx(0, 1, ktail, 0, b); + if (idx < 0) { + assert(!"Requested int8 matmul kernel was not created."); + return; + } + int na = 1; + kernel_execute(idx, na, n_b, new_m_blk_adr, n_blk_adr, + new_dst_adr, bias_addr, scl_addr, zp_ptr_a_adr, + new_zp_ptr_b_adr, zp_b_buf); + } + + if (ntail && b.n_tail > 0 && n_a > 0) { + int new_dst_adr = dst_adr + (b.n_blk * b.ld_block) * n_b; + int new_n_blk_adr = n_blk_adr + + (b.n_blk * b.ld_block) * n_b * div_up(K, b.k_blk) + * b.k_blk; + int new_zp_b_buf = zp_b_buf + (b.n_blk * b.ld_block) * n_b; + int new_zp_ptr_a_adr + = zp_ptr_a_adr + (b.n_blk * b.ld_block) * n_b; + int new_bias_addr = bias_addr + (b.n_blk * b.ld_block) * n_b; + int new_scl_addr = scl_addr + + ((b.is_oc_scales) ? ((b.n_blk * b.ld_block) * n_b) + : 0); + idx = pd()->get_idx(0, 0, ktail, 1, b); + if (idx < 0) { + assert(!"Requested int8 matmul kernel was not created."); + return; + } + int nb = 1; + + kernel_execute(idx, n_a, nb, m_blk_adr, new_n_blk_adr, + new_dst_adr, new_bias_addr, new_scl_addr, + new_zp_ptr_a_adr, zp_ptr_b_adr, new_zp_b_buf); + } + + if (mtail && b.m_tail > 0 && ntail && b.n_tail > 0) { + int new_dst_adr = dst_adr + (b.n_blk * b.ld_block) * n_b + + b.m_blk * n_a * N; + int new_m_blk_adr = m_blk_adr + + b.m_blk * n_a * div_up(K, b.k_blk) * b.k_blk; + int new_n_blk_adr = n_blk_adr + + (b.n_blk * b.ld_block) * n_b * div_up(K, b.k_blk) + * b.k_blk; + int new_zp_b_buf = zp_b_buf + (b.n_blk * b.ld_block) * n_b; + int new_zp_ptr_a_adr + = zp_ptr_a_adr + (b.n_blk * b.ld_block) * n_b; + int new_zp_ptr_b_adr = zp_ptr_b_adr + b.m_blk * n_a; + int new_bias_addr = bias_addr + (b.n_blk * b.ld_block) * n_b; + int new_scl_addr = scl_addr + + ((b.is_oc_scales) ? ((b.n_blk * b.ld_block) * n_b) + : 0); + idx = pd()->get_idx(0, 1, ktail, 1, b); + if (idx < 0) { + assert(!"Requested int8 matmul kernel was not created."); + return; + } + int nb = 1, na = 1; + kernel_execute(idx, na, nb, new_m_blk_adr, new_n_blk_adr, + new_dst_adr, new_bias_addr, new_scl_addr, + new_zp_ptr_a_adr, new_zp_ptr_b_adr, new_zp_b_buf); + } + start++; + } + }); + + return status::success; +} + +} // namespace matmul +} // namespace aarch64 +} // namespace cpu +} // namespace impl +} // namespace dnnl diff --git a/src/cpu/aarch64/matmul/jit_int8_matmul.hpp b/src/cpu/aarch64/matmul/jit_int8_matmul.hpp new file mode 100644 index 00000000000..6cc32633a2a --- /dev/null +++ b/src/cpu/aarch64/matmul/jit_int8_matmul.hpp @@ -0,0 +1,111 @@ +/******************************************************************************* +* Copyright 2025 FUJITSU LIMITED +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#ifndef CPU_AARCH64_JIT_INT8_MATMUL_HPP
+#define CPU_AARCH64_JIT_INT8_MATMUL_HPP
+
+#include "common/c_types_map.hpp"
+#include "common/primitive.hpp"
+#include "common/type_helpers.hpp"
+#include "common/utils.hpp"
+
+#include "cpu/platform.hpp"
+#include "cpu/primitive_attr_postops.hpp"
+
+#include "cpu/aarch64/matmul/jit_int8_kernel_types.hpp"
+#include "cpu/matmul/cpu_matmul_pd.hpp"
+
+namespace dnnl {
+namespace impl {
+namespace cpu {
+namespace aarch64 {
+namespace matmul {
+
+struct jit_int8_matmul_kernel_t;
+struct jit_int8_matmul_utils_kernel_t;
+
+struct jit_int8_matmul_t : public primitive_t {
+    struct pd_t : public dnnl::impl::cpu::matmul::cpu_matmul_pd_t {
+        using ::dnnl::impl::cpu::matmul::cpu_matmul_pd_t::cpu_matmul_pd_t;
+
+        DECLARE_COMMON_PD_T("jit:int8", jit_int8_matmul_t);
+
+        status_t init(engine_t *engine);
+
+        bool formats_ok() const {
+
+            const memory_desc_wrapper src_d(src_md_);
+            const memory_desc_wrapper weights_d(weights_md_);
+            const memory_desc_wrapper dst_d(dst_md_);
+            const bool is_dst = dst_d.matches_one_of_tag(format_tag::ab,
+                                        format_tag::abc, format_tag::abcd)
+                            != format_tag::undef
+                    || dst_d.format_kind() == format_kind::any;
+            const bool is_wei
+                    = weights_d.matches_one_of_tag(format_tag::ab,
+                              format_tag::abc, format_tag::abcd,
+                              format_tag::BA24b8a, format_tag::aCB24c8b,
+                              format_tag::abDC24d8c)
+                            != format_tag::undef
+                    || weights_d.format_kind() == format_kind::any;
+            const bool is_src = src_d.matches_one_of_tag(format_tag::ab,
+                                        format_tag::abc, format_tag::abcd)
+                            != format_tag::undef
+                    || src_d.format_kind() == format_kind::any;
+            return is_dst && is_wei && is_src;
+        }
+        const brg_int8_t &get_b() const { return brg_; }
+
+        const dyn_vals_t &get_d() const { return dyn_; }
+
+        int get_idx(int z, int m, int k, int n, const brg_int8_t b) const {
+
+            if (b.zp_type_a == jit_int8_broadcast_t::none
+                    && b.zp_type_b == jit_int8_broadcast_t::none && z == 1)
+                return -1;
+            int mt = b.M % b.m_blk;
+            int nt = b.N % (b.n_blk * b.ld_block);
+            int kt = b.K % (b.k_blk * 4);
+            if ((m == 1 && mt == 0) || (k == 1 && kt == 0)
+                    || (n == 1 && nt == 0) || (k == 0 && kt == 1))
+                return -1;
+            return k + n * 2 + m * 2 * 2 + z * 2 * 2 * 2;
+        }
+
+    private:
+        brg_int8_t brg_;
+        dyn_vals_t dyn_;
+    };
+
+    jit_int8_matmul_t(const pd_t *apd);
+    ~jit_int8_matmul_t() override;
+    int get_idx(int z, int m, int k, int n, int M, int K, int N);
+    status_t init(engine_t *engine) override;
+    status_t execute(const exec_ctx_t &ctx) const override;
+
+private:
+    const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); }
+    std::unique_ptr<jit_int8_matmul_kernel_t> int8_kernels_[16];
+    std::unique_ptr<jit_int8_matmul_utils_kernel_t> reo_ker_a_;
+    std::unique_ptr<jit_int8_matmul_utils_kernel_t> reo_ker_b_;
+};
+
+} // namespace matmul
+} // namespace aarch64
+} // namespace cpu
+} // namespace impl
+} // namespace dnnl
+#endif
diff --git a/src/cpu/aarch64/matmul/jit_int8_matmul_utils.cpp b/src/cpu/aarch64/matmul/jit_int8_matmul_utils.cpp
new file mode 100644
index 00000000000..3908598677f
--- /dev/null
+++ b/src/cpu/aarch64/matmul/jit_int8_matmul_utils.cpp
@@ -0,0 +1,294 @@
+/*******************************************************************************
+* Copyright 2025 FUJITSU LIMITED
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#include <cstddef>
+
+#include "common/math_utils.hpp"
+#include "cpu/aarch64/jit_generator.hpp"
+#include "cpu/aarch64/matmul/jit_int8_matmul_utils.hpp"
+
+#define GET_OFF(field) (uint32_t) offsetof(dyn_params_t, field)
+
+#define LDR_IMM(reg, addr, off) \
+    { \
+        const uint64_t IMM12_MASK = ~uint64_t(0xfff); \
+        if ((off & IMM12_MASK) == 0) { \
+            ldr(reg, ptr(addr, off)); \
+        } else { \
+            add_imm(X_DEFAULT_ADDR, addr, off, X_TMP_0); \
+            ldr(reg, ptr(X_DEFAULT_ADDR)); \
+        } \
+    }
+
+#define VCHECK_BG(f, msg, ...) \
+    VCHECK(primitive, create, dispatch, brgemm_matmul, f, msg, ##__VA_ARGS__);
+
+namespace dnnl {
+namespace impl {
+namespace cpu {
+namespace aarch64 {
+namespace matmul {
+
+using namespace Xbyak_aarch64;
+using namespace dnnl::impl::format_tag;
+using namespace dnnl::impl::utils;
+
+using namespace nstl;
+
+using namespace data_type;
+
+void jit_int8_matmul_utils_kernel_t::reo_A_8x8(int lp, int kt) {
+    mov(reg_tmp_1, reg_tmp);
+    if (kt > 0) {
+        for (int i = 0; i < lp; i++) {
+            ld1b(ZRegB(i), prd_ld, ptr(reg_tmp_1));
+            add_imm(reg_tmp_1, reg_tmp_1, dyn_.K, X_TMP_0);
+            st1b(ZRegB(i), prd_st, ptr(reg_dst));
+            add_imm(reg_dst, reg_dst, dyn_.k_blk, X_TMP_0);
+        }
+        for (int i = 0; i < dyn_.m_blk - lp; i++) {
+            mov(ZRegB(i), 0);
+            st1b(ZRegB(i), prd_st, ptr(reg_dst));
+            add_imm(reg_dst, reg_dst, dyn_.k_blk, X_TMP_0);
+        }
+    } else {
+        for (int i = 0; i < lp; i++) {
+            ldr(DReg(i), ptr(reg_tmp_1));
+            add_imm(reg_tmp_1, reg_tmp_1, dyn_.K, X_TMP_0);
+            str(DReg(i), ptr(reg_dst));
+            add_imm(reg_dst, reg_dst, dyn_.k_blk, X_TMP_0);
+        }
+        for (int i = 0; i < dyn_.m_blk - lp; i++) {
+            mov(ZRegB(i), 0);
+            st1b(ZRegB(i), prd_st, ptr(reg_dst));
+            add_imm(reg_dst, reg_dst, dyn_.k_blk, X_TMP_0);
+        }
+    }
+}
+
+void jit_int8_matmul_utils_kernel_t::reo_B_8x24(int lp, int nt) {
+    auto p = (nt > 0) ?
prd_p3 : prd_ld; + mov(reg_tmp, reg_aux_a); + for (int i = 0; i < lp; i++) { + ld1b(ZRegB(i), p, ptr(reg_tmp)); + add_imm(reg_tmp, reg_tmp, dyn_.N, X_TMP_4); + } + for (int i = lp; i < dyn_.k_blk; i++) { + mov(ZRegB(i), 0); + } + + zip2(ZRegB(8), ZRegB(0), ZRegB(1)); + zip1(ZRegB(0), ZRegB(0), ZRegB(1)); + zip2(ZRegB(10), ZRegB(2), ZRegB(3)); + zip1(ZRegB(2), ZRegB(2), ZRegB(3)); + zip2(ZRegB(12), ZRegB(4), ZRegB(5)); + zip1(ZRegB(4), ZRegB(4), ZRegB(5)); + zip2(ZRegB(14), ZRegB(6), ZRegB(7)); + zip1(ZRegB(6), ZRegB(6), ZRegB(7)); + + zip2(ZRegH(1), ZRegH(0), ZRegH(2)); + zip1(ZRegH(0), ZRegH(0), ZRegH(2)); + zip2(ZRegH(5), ZRegH(4), ZRegH(6)); + zip1(ZRegH(4), ZRegH(4), ZRegH(6)); + zip1(ZRegH(8), ZRegH(8), ZRegH(10)); + zip1(ZRegH(12), ZRegH(12), ZRegH(14)); + + zip2(ZRegS(2), ZRegS(0), ZRegS(4)); + zip1(ZRegS(0), ZRegS(0), ZRegS(4)); + zip2(ZRegS(6), ZRegS(1), ZRegS(5)); + zip1(ZRegS(1), ZRegS(1), ZRegS(5)); + zip2(ZRegS(10), ZRegS(8), ZRegS(12)); + zip1(ZRegS(8), ZRegS(8), ZRegS(12)); + + str(ZReg(0), ptr(reg_aux_b, 0, MUL_VL)); + str(ZReg(2), ptr(reg_aux_b, 1, MUL_VL)); + str(ZReg(1), ptr(reg_aux_b, 2, MUL_VL)); + str(ZReg(6), ptr(reg_aux_b, 3, MUL_VL)); + str(ZReg(8), ptr(reg_aux_b, 4, MUL_VL)); + str(ZReg(10), ptr(reg_aux_b, 5, MUL_VL)); + + add_imm(reg_aux_b, reg_aux_b, dyn_.n_blk * dyn_.k_blk, X_TMP_4); +} + +void jit_int8_matmul_utils_kernel_t::gen_reo_a() { + + int ktl = (dyn_.ktail) ? dyn_.ktail : dyn_.k_blk; + + set_preg(prd_ld.b, ktl, X_TMP_0, X_TMP_1); + set_preg(prd_st.b, dyn_.k_blk, X_TMP_0, X_TMP_1); + + int lp = (dyn_.mtail) ? dyn_.mtail : dyn_.m_blk; + + Label m_loop, last_m, m_end, k_loop, last_k, k_end, k_loop_1, last_k_1, + k_end_1; + + LDR_IMM(reg_max, reg_param, GET_OFF(nk)); + LDR_IMM(reg_min, reg_param, GET_OFF(nm)); + + LDR_IMM(reg_tmp_2, reg_param, GET_OFF(tl)); + ldr(WReg(reg_tail.getIdx()), ptr(reg_tmp_2)); + + LDR_IMM(reg_tmp_2, reg_param, GET_OFF(mtl)); + ldr(WReg(reg_m_tail.getIdx()), ptr(reg_tmp_2)); + + ldr(WReg(reg_m_loop.getIdx()), ptr(reg_min)); + + cmp(reg_m_loop, 1); + b(EQ, last_m); + L(m_loop); + ldr(WReg(reg_k_loop.getIdx()), ptr(reg_max)); + mov(reg_tmp, reg_src); + cmp(reg_k_loop, 1); + b(EQ, last_k); + L(k_loop); + reo_A_8x8(dyn_.m_blk, 0); + add_imm(reg_tmp, reg_tmp, dyn_.k_blk, X_TMP_0); + sub(reg_k_loop, reg_k_loop, 1); + cmp(reg_k_loop, 1); + b(GT, k_loop); + b(LT, k_end); + L(last_k); + sub(reg_k_loop, reg_k_loop, 1); + cmp(reg_tail, 0); + b(EQ, k_loop); + reo_A_8x8(dyn_.m_blk, 1); + L(k_end); + add_imm(reg_src, reg_src, dyn_.K * dyn_.m_blk, X_TMP_0); + sub(reg_m_loop, reg_m_loop, 1); + cmp(reg_m_loop, 1); + b(GT, m_loop); + b(LT, m_end); + + L(last_m); + sub(reg_m_loop, reg_m_loop, 1); + cmp(reg_m_tail, 0); + b(EQ, m_loop); + ldr(WReg(reg_k_loop.getIdx()), ptr(reg_max)); + mov(reg_tmp, reg_src); + cmp(reg_k_loop, 1); + b(EQ, last_k_1); + L(k_loop_1); + reo_A_8x8(lp, 0); + add_imm(reg_tmp, reg_tmp, dyn_.k_blk, X_TMP_0); + sub(reg_k_loop, reg_k_loop, 1); + cmp(reg_k_loop, 1); + b(GT, k_loop_1); + b(LT, k_end_1); + L(last_k_1); + sub(reg_k_loop, reg_k_loop, 1); + cmp(reg_tail, 0); + b(EQ, k_loop_1); + reo_A_8x8(lp, 1); + L(k_end_1); + L(m_end); +} + +void jit_int8_matmul_utils_kernel_t::gen_reo_b() { + + int lp = (dyn_.ktail > 0) ? 
dyn_.ktail : dyn_.k_blk; + + set_preg(prd_ld.b, dyn_.n_blk, X_TMP_4, X_TMP_1); + set_preg(prd_p3.b, dyn_.ntail, X_TMP_4, X_TMP_1); + + LDR_IMM(reg_max, reg_param, GET_OFF(nn)); + LDR_IMM(reg_min, reg_param, GET_OFF(nk)); + + LDR_IMM(reg_tmp_2, reg_param, GET_OFF(tl)); + ldr(WReg(reg_tail.getIdx()), ptr(reg_tmp_2)); + + LDR_IMM(reg_tmp_2, reg_param, GET_OFF(ntl)); + ldr(WReg(reg_n_tail.getIdx()), ptr(reg_tmp_2)); + + ldr(WReg(reg_n_loop.getIdx()), ptr(reg_max)); + ldr(WReg(reg_k_loop.getIdx()), ptr(reg_min)); + + mov(reg_aux_a, reg_src); + mov(reg_aux_b, reg_dst); + + Label n_loop, last_n, n_end, k_loop, last_k, k_end, k_loop_1, last_k_1, + k_end_1; + + cmp(reg_n_loop, 1); + b(EQ, last_n); + L(n_loop); + ldr(WReg(reg_k_loop.getIdx()), ptr(reg_min)); + mov(reg_aux_a, reg_src); + cmp(reg_k_loop, 1); + b(EQ, last_k); + L(k_loop); + reo_B_8x24(dyn_.k_blk, 0); + add_imm(reg_aux_a, reg_aux_a, dyn_.k_blk * dyn_.N, X_TMP_4); + sub(reg_k_loop, reg_k_loop, 1); + cmp(reg_k_loop, 1); + b(GT, k_loop); + b(LT, k_end); + L(last_k); + sub(reg_k_loop, reg_k_loop, 1); + cmp(reg_tail, 0); + b(EQ, k_loop); + reo_B_8x24(lp, 0); + L(k_end); + add_imm(reg_src, reg_src, dyn_.n_blk, X_TMP_4); + sub(reg_n_loop, reg_n_loop, 1); + cmp(reg_n_loop, 1); + b(GT, n_loop); + b(LT, n_end); + + L(last_n); + sub(reg_n_loop, reg_n_loop, 1); + cmp(reg_n_tail, 0); + b(EQ, n_loop); + ldr(WReg(reg_k_loop.getIdx()), ptr(reg_min)); + mov(reg_aux_a, reg_src); + cmp(reg_k_loop, 1); + b(EQ, last_k_1); + L(k_loop_1); + reo_B_8x24(dyn_.k_blk, dyn_.ntail); + add_imm(reg_aux_a, reg_aux_a, dyn_.k_blk * dyn_.N, X_TMP_4); + sub(reg_k_loop, reg_k_loop, 1); + cmp(reg_k_loop, 1); + b(GT, k_loop_1); + b(LT, k_end_1); + L(last_k_1); + sub(reg_k_loop, reg_k_loop, 1); + cmp(reg_tail, 0); + b(EQ, k_loop_1); + reo_B_8x24(lp, dyn_.ntail); + L(k_end_1); + L(n_end); +} + +void jit_int8_matmul_utils_kernel_t::generate() { + + preamble(); + + if (dyn_.reorder_a == 1) { + LDR_IMM(reg_src, reg_param, GET_OFF(src)); + LDR_IMM(reg_dst, reg_param, GET_OFF(dst)); + gen_reo_a(); + } else if (dyn_.reorder_b == 1) { + LDR_IMM(reg_src, reg_param, GET_OFF(src)); + LDR_IMM(reg_dst, reg_param, GET_OFF(dst)); + gen_reo_b(); + } + + postamble(); +} +} // namespace matmul +} // namespace aarch64 +} // namespace cpu +} // namespace impl +} // namespace dnnl diff --git a/src/cpu/aarch64/matmul/jit_int8_matmul_utils.hpp b/src/cpu/aarch64/matmul/jit_int8_matmul_utils.hpp new file mode 100644 index 00000000000..d7905f84896 --- /dev/null +++ b/src/cpu/aarch64/matmul/jit_int8_matmul_utils.hpp @@ -0,0 +1,86 @@ +/******************************************************************************* +* Copyright 2025 FUJITSU LIMITED +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +#ifndef CPU_AARCH64_JIT_INT8_MATMUL_UTILS_HPP +#define CPU_AARCH64_JIT_INT8_MATMUL_UTILS_HPP + +// #include "common/primitive.hpp" +#include "cpu/aarch64/jit_generator.hpp" +#include "cpu/aarch64/matmul/jit_int8_kernel_types.hpp" + +namespace dnnl { +namespace impl { +namespace cpu { +namespace aarch64 { +namespace matmul { + +using namespace Xbyak_aarch64; +struct jit_int8_matmul_utils_kernel_t : public jit_generator { + + DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_int8_matmul_utils_kernel_t); + + XReg reg_param = abi_param1; + XReg reg_src = x3; + XReg reg_dst = x4; + XReg reg_scl = x5; + XReg reg_zp = x6; + XReg reg_tmp = x7; + XReg reg_tmp_2 = x8; + XReg reg_max = x9; + XReg reg_min = x10; + XReg reg_tmp_1 = x11; + XReg reg_k_loop = x12; + XReg reg_m_loop = x13; + XReg reg_loop = x14; + XReg reg_tail = x15; + XReg reg_m_tail = x16; + XReg reg_aux_b = x17; + XReg reg_aux_a = x18; + + PReg prd_ld = p1; + PReg prd_st = p2; + PReg prd_p1 = p3; + PReg prd_p2 = p4; + PReg prd_p3 = p5; + + XReg reg_n_loop = reg_m_loop; + XReg reg_n_tail = reg_m_tail; + + int f32_dt_sz = 4; + + void operator()(const dyn_params_t *p) { + return jit_generator::operator()(p); + } + + jit_int8_matmul_utils_kernel_t(const dyn_vals_t &k) : dyn_(k) {} + ~jit_int8_matmul_utils_kernel_t() override = default; + +private: + void gen_reo_a(); + void gen_reo_b(); + void reo_A_8x8(int, int); + void reo_B_8x24(int, int); + void generate() override; + + dyn_vals_t dyn_; +}; + +} // namespace matmul +} // namespace aarch64 +} // namespace cpu +} // namespace impl +} // namespace dnnl +#endif diff --git a/src/cpu/aarch64/xbyak_aarch64/_clang-format b/src/cpu/aarch64/xbyak_aarch64/_clang-format deleted file mode 100644 index af422e6188f..00000000000 --- a/src/cpu/aarch64/xbyak_aarch64/_clang-format +++ /dev/null @@ -1,127 +0,0 @@ ---- -Language: Cpp -# BasedOnStyle: LLVM -AccessModifierOffset: -2 -AlignAfterOpenBracket: Align -AlignConsecutiveMacros: false -AlignConsecutiveAssignments: false -AlignConsecutiveDeclarations: false -AlignEscapedNewlines: Right -AlignOperands: true -AlignTrailingComments: true -AllowAllArgumentsOnNextLine: true -AllowAllConstructorInitializersOnNextLine: true -AllowAllParametersOfDeclarationOnNextLine: true -AllowShortBlocksOnASingleLine: false -AllowShortCaseLabelsOnASingleLine: false -AllowShortFunctionsOnASingleLine: All -AllowShortLambdasOnASingleLine: All -AllowShortIfStatementsOnASingleLine: false -AllowShortLoopsOnASingleLine: false -AlwaysBreakAfterDefinitionReturnType: None -AlwaysBreakAfterReturnType: None -AlwaysBreakBeforeMultilineStrings: false -AlwaysBreakTemplateDeclarations: MultiLine -BinPackArguments: true -BinPackParameters: true -BraceWrapping: - AfterCaseLabel: false - AfterClass: false - AfterControlStatement: false - AfterEnum: false - AfterFunction: false - AfterNamespace: false - AfterObjCDeclaration: false - AfterStruct: false - AfterUnion: false - AfterExternBlock: false - BeforeCatch: false - BeforeElse: false - IndentBraces: false - SplitEmptyFunction: true - SplitEmptyRecord: true - SplitEmptyNamespace: true -BreakBeforeBinaryOperators: None -BreakBeforeBraces: Attach -BreakBeforeInheritanceComma: false -BreakInheritanceList: BeforeColon -BreakBeforeTernaryOperators: true -BreakConstructorInitializersBeforeComma: false -BreakConstructorInitializers: BeforeColon -BreakAfterJavaFieldAnnotations: false -BreakStringLiterals: true -ColumnLimit: 300 -CommentPragmas: '^ IWYU pragma:' -CompactNamespaces: 
false -ConstructorInitializerAllOnOneLineOrOnePerLine: false -ConstructorInitializerIndentWidth: 4 -ContinuationIndentWidth: 4 -Cpp11BracedListStyle: true -DerivePointerAlignment: false -DisableFormat: false -ExperimentalAutoDetectBinPacking: false -FixNamespaceComments: true -ForEachMacros: - - foreach - - Q_FOREACH - - BOOST_FOREACH -IncludeBlocks: Preserve -IncludeCategories: - - Regex: '^"(llvm|llvm-c|clang|clang-c)/' - Priority: 2 - - Regex: '^(<|"(gtest|gmock|isl|json)/)' - Priority: 3 - - Regex: '.*' - Priority: 1 -IncludeIsMainRegex: '(Test)?$' -IndentCaseLabels: false -IndentPPDirectives: None -IndentWidth: 2 -IndentWrappedFunctionNames: false -JavaScriptQuotes: Leave -JavaScriptWrapImports: true -KeepEmptyLinesAtTheStartOfBlocks: true -MacroBlockBegin: '' -MacroBlockEnd: '' -MaxEmptyLinesToKeep: 1 -NamespaceIndentation: None -ObjCBinPackProtocolList: Auto -ObjCBlockIndentWidth: 2 -ObjCSpaceAfterProperty: false -ObjCSpaceBeforeProtocolList: true -PenaltyBreakAssignment: 2 -PenaltyBreakBeforeFirstCallParameter: 19 -PenaltyBreakComment: 120 -PenaltyBreakFirstLessLess: 120 -PenaltyBreakString: 1000 -PenaltyBreakTemplateDeclaration: 10 -PenaltyExcessCharacter: 1000000 -PenaltyReturnTypeOnItsOwnLine: 60 -PointerAlignment: Right -ReflowComments: true -SortIncludes: true -SortUsingDeclarations: true -SpaceAfterCStyleCast: false -SpaceAfterLogicalNot: false -SpaceAfterTemplateKeyword: true -SpaceBeforeAssignmentOperators: true -SpaceBeforeCpp11BracedList: false -SpaceBeforeCtorInitializerColon: true -SpaceBeforeInheritanceColon: true -SpaceBeforeParens: ControlStatements -SpaceBeforeRangeBasedForLoopColon: true -SpaceInEmptyParentheses: false -SpacesBeforeTrailingComments: 1 -SpacesInAngles: false -SpacesInContainerLiterals: true -SpacesInCStyleCastParentheses: false -SpacesInParentheses: false -SpacesInSquareBrackets: false -Standard: Cpp11 -StatementMacros: - - Q_UNUSED - - QT_REQUIRE_VERSION -TabWidth: 8 -UseTab: Never -... - diff --git a/src/cpu/acl/CMakeLists.txt b/src/cpu/acl/CMakeLists.txt new file mode 100644 index 00000000000..abe0a5c49eb --- /dev/null +++ b/src/cpu/acl/CMakeLists.txt @@ -0,0 +1,33 @@ +#******************************************************************************* +# Copyright 2020-2022 Arm Ltd. and affiliates +# Copyright 2020-2021 FUJITSU LIMITED +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +#******************************************************************************* +file(GLOB_RECURSE SOURCES + ${CMAKE_CURRENT_SOURCE_DIR}/*.[ch] + ${CMAKE_CURRENT_SOURCE_DIR}/*.[ch]pp + ) +# If the runtime is not THREADPOOL remove threadpool_scheduler sources. 
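+# (acl_threadpool_scheduler.cpp/hpp wrap a user-supplied threadpool, so they
+# can only be compiled when DNNL_CPU_RUNTIME is THREADPOOL; for other runtimes
+# they are dropped from SOURCES below.)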
+if(NOT DNNL_CPU_RUNTIME STREQUAL "THREADPOOL")
+    list(APPEND ACL_THREADPOOL_FILES
+        ${CMAKE_CURRENT_SOURCE_DIR}/acl_threadpool_scheduler.cpp
+        ${CMAKE_CURRENT_SOURCE_DIR}/acl_threadpool_scheduler.hpp
+        )
+    list(REMOVE_ITEM SOURCES ${ACL_THREADPOOL_FILES})
+endif()
+set(OBJ_LIB ${DNNL_LIBRARY_NAME}_cpu_acl)
+add_library(${OBJ_LIB} OBJECT ${SOURCES})
+set_property(GLOBAL APPEND PROPERTY DNNL_LIB_DEPS
+    $<TARGET_OBJECTS:${OBJ_LIB}>)
+enable_conditional_compilation4(${OBJ_LIB})
\ No newline at end of file
diff --git a/src/cpu/aarch64/acl_batch_normalization.cpp b/src/cpu/acl/acl_batch_normalization.cpp
similarity index 96%
rename from src/cpu/aarch64/acl_batch_normalization.cpp
rename to src/cpu/acl/acl_batch_normalization.cpp
index 77a723207fc..83f4c5061a0 100644
--- a/src/cpu/aarch64/acl_batch_normalization.cpp
+++ b/src/cpu/acl/acl_batch_normalization.cpp
@@ -14,12 +14,12 @@
 * limitations under the License.
 *******************************************************************************/
 
-#include "cpu/aarch64/acl_batch_normalization.hpp"
+#include "cpu/acl/acl_batch_normalization.hpp"
 
 namespace dnnl {
 namespace impl {
 namespace cpu {
-namespace aarch64 {
+namespace acl {
 
 status_t acl_batch_normalization_fwd_t::execute_forward(
         const exec_ctx_t &ctx) const {
@@ -72,7 +72,7 @@ status_t acl_batch_normalization_fwd_t::execute_forward(
     return status::success;
 }
 
-} // namespace aarch64
+} // namespace acl
 } // namespace cpu
 } // namespace impl
 } // namespace dnnl
diff --git a/src/cpu/aarch64/acl_batch_normalization.hpp b/src/cpu/acl/acl_batch_normalization.hpp
similarity index 95%
rename from src/cpu/aarch64/acl_batch_normalization.hpp
rename to src/cpu/acl/acl_batch_normalization.hpp
index 9e91e8b7279..ef7e4c22cbd 100644
--- a/src/cpu/aarch64/acl_batch_normalization.hpp
+++ b/src/cpu/acl/acl_batch_normalization.hpp
@@ -14,18 +14,18 @@
 * limitations under the License.
*******************************************************************************/
 
-#ifndef CPU_AARCH64_ACL_BATCH_NORMALIZATION_HPP
-#define CPU_AARCH64_ACL_BATCH_NORMALIZATION_HPP
+#ifndef CPU_ACL_BATCH_NORMALIZATION_HPP
+#define CPU_ACL_BATCH_NORMALIZATION_HPP
 
 #include "cpu/cpu_batch_normalization_pd.hpp"
 
-#include "cpu/aarch64/acl_post_ops.hpp"
-#include "cpu/aarch64/acl_utils.hpp"
+#include "cpu/acl/acl_post_ops.hpp"
+#include "cpu/acl/acl_utils.hpp"
 
 namespace dnnl {
 namespace impl {
 namespace cpu {
-namespace aarch64 {
+namespace acl {
 
 struct acl_batch_normalization_obj_t {
     arm_compute::NEBatchNormalizationLayer bnorm;
@@ -92,12 +92,6 @@ struct acl_batch_normalization_fwd_t : public primitive_t {
         using cpu_batch_normalization_fwd_pd_t::
                 cpu_batch_normalization_fwd_pd_t;
 
-        pd_t(const batch_normalization_desc_t *adesc,
-                const primitive_attr_t *attr,
-                const typename pd_t::base_class *hint_fwd_pd)
-            : cpu_batch_normalization_fwd_pd_t(adesc, attr, hint_fwd_pd)
-            , abp() {}
-
         DECLARE_COMMON_PD_T("acl", acl_batch_normalization_fwd_t);
 
         status_t init(engine_t *engine) {
@@ -240,7 +234,7 @@ struct acl_batch_normalization_fwd_t : public primitive_t {
             return status::success;
         }
 
-        acl_batch_normalization_conf_t abp;
+        acl_batch_normalization_conf_t abp = utils::zero<decltype(abp)>();
 
         acl_post_ops_t post_ops;
     }; // pd_t
@@ -272,7 +266,7 @@ struct acl_batch_normalization_fwd_t : public primitive_t {
     const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); }
 }; // acl_batch_normalization_fwd_t
 
-} // namespace aarch64
+} // namespace acl
 } // namespace cpu
 } // namespace impl
 } // namespace dnnl
diff --git a/src/cpu/aarch64/acl_benchmark_scheduler.cpp b/src/cpu/acl/acl_benchmark_scheduler.cpp
similarity index 96%
rename from src/cpu/aarch64/acl_benchmark_scheduler.cpp
rename to src/cpu/acl/acl_benchmark_scheduler.cpp
index e8658dd6e6a..b2ceb96339e 100644
--- a/src/cpu/aarch64/acl_benchmark_scheduler.cpp
+++ b/src/cpu/acl/acl_benchmark_scheduler.cpp
@@ -14,13 +14,13 @@
 * limitations under the License.
 *******************************************************************************/
 
-#include "cpu/aarch64/acl_benchmark_scheduler.hpp"
+#include "cpu/acl/acl_benchmark_scheduler.hpp"
 #include "common/verbose.hpp"
 
 namespace dnnl {
 namespace impl {
 namespace cpu {
-namespace aarch64 {
+namespace acl {
 
 using namespace arm_compute;
 
 BenchmarkScheduler::BenchmarkScheduler(IScheduler &real_scheduler)
@@ -72,7 +72,7 @@ void BenchmarkScheduler::run_workloads(std::vector<Workload> &workloads) {
     ARM_COMPUTE_ERROR("Can't be reached");
 }
 
-} // namespace aarch64
+} // namespace acl
 } // namespace cpu
 } // namespace impl
 } // namespace dnnl
\ No newline at end of file
diff --git a/src/cpu/aarch64/acl_benchmark_scheduler.hpp b/src/cpu/acl/acl_benchmark_scheduler.hpp
similarity index 92%
rename from src/cpu/aarch64/acl_benchmark_scheduler.hpp
rename to src/cpu/acl/acl_benchmark_scheduler.hpp
index 8fddf7ea298..a23a903c385 100644
--- a/src/cpu/aarch64/acl_benchmark_scheduler.hpp
+++ b/src/cpu/acl/acl_benchmark_scheduler.hpp
@@ -13,8 +13,8 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
*******************************************************************************/
-#ifndef CPU_AARCH64_ACL_BENCHMARK_SCHEDULER_HPP
-#define CPU_AARCH64_ACL_BENCHMARK_SCHEDULER_HPP
+#ifndef CPU_ACL_BENCHMARK_SCHEDULER_HPP
+#define CPU_ACL_BENCHMARK_SCHEDULER_HPP
 
 #include "arm_compute/core/CPP/ICPPKernel.h"
 #include "arm_compute/runtime/IScheduler.h"
@@ -22,7 +22,7 @@
 namespace dnnl {
 namespace impl {
 namespace cpu {
-namespace aarch64 {
+namespace acl {
 
 // BenchmarkScheduler implements ACL's IScheduler interface and acts as an interceptor scheduler
 // when DNNL_VERBOSE=profile,profile_externals. It intercepts calls made by the actual scheduler used by ACL, adds
 // timers to benchmark execution time of ACL kernels, and stores kernel information.
@@ -52,9 +52,9 @@ class BenchmarkScheduler final : public arm_compute::IScheduler {
     IScheduler &_real_scheduler;
 };
 
-#endif // CPU_AARCH64_ACL_BENCHMARK_SCHEDULER_HPP
+#endif // CPU_ACL_BENCHMARK_SCHEDULER_HPP
 
-} // namespace aarch64
+} // namespace acl
 } // namespace cpu
 } // namespace impl
 } // namespace dnnl
diff --git a/src/cpu/aarch64/acl_binary.cpp b/src/cpu/acl/acl_binary.cpp
similarity index 99%
rename from src/cpu/aarch64/acl_binary.cpp
rename to src/cpu/acl/acl_binary.cpp
index b1b70c80636..04418e65e70 100644
--- a/src/cpu/aarch64/acl_binary.cpp
+++ b/src/cpu/acl/acl_binary.cpp
@@ -27,7 +27,7 @@
 namespace dnnl {
 namespace impl {
 namespace cpu {
-namespace aarch64 {
+namespace acl {
 
 status_t acl_binary_t::pd_t::init(engine_t *engine) {
     using namespace acl_utils;
@@ -229,7 +229,7 @@ const acl_binary_t::pd_t *acl_binary_t::pd() const {
     return static_cast<const pd_t *>(primitive_t::pd().get());
 }
 
-} // namespace aarch64
+} // namespace acl
 } // namespace cpu
 } // namespace impl
 } // namespace dnnl
diff --git a/src/cpu/aarch64/acl_binary.hpp b/src/cpu/acl/acl_binary.hpp
similarity index 95%
rename from src/cpu/aarch64/acl_binary.hpp
rename to src/cpu/acl/acl_binary.hpp
index 41ecdded523..7040fe8aa42 100644
--- a/src/cpu/aarch64/acl_binary.hpp
+++ b/src/cpu/acl/acl_binary.hpp
@@ -14,8 +14,8 @@
 * limitations under the License.
 *******************************************************************************/
 
-#ifndef CPU_AARCH64_ACL_BINARY_HPP
-#define CPU_AARCH64_ACL_BINARY_HPP
+#ifndef CPU_ACL_BINARY_HPP
+#define CPU_ACL_BINARY_HPP
 
 #include "acl_utils.hpp"
 #include "cpu/cpu_binary_pd.hpp"
@@ -28,7 +28,7 @@
 namespace dnnl {
 namespace impl {
 namespace cpu {
-namespace aarch64 {
+namespace acl {
 
 struct acl_binary_conf_t {
     arm_compute::TensorInfo src0_info;
@@ -73,7 +73,7 @@ struct acl_binary_t : public primitive_t {
 
 }; // acl_binary_t
 
-} // namespace aarch64
+} // namespace acl
 } // namespace cpu
 } // namespace impl
 } // namespace dnnl
diff --git a/src/cpu/aarch64/acl_convolution_utils.cpp b/src/cpu/acl/acl_convolution_utils.cpp
similarity index 84%
rename from src/cpu/aarch64/acl_convolution_utils.cpp
rename to src/cpu/acl/acl_convolution_utils.cpp
index 15437746069..fcb3c36e394 100644
--- a/src/cpu/aarch64/acl_convolution_utils.cpp
+++ b/src/cpu/acl/acl_convolution_utils.cpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2020-2024 Arm Ltd. and affiliates
+* Copyright 2020-2025 Arm Ltd. and affiliates
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -14,7 +14,7 @@
 * limitations under the License.
*******************************************************************************/
 
-#include "acl_convolution_utils.hpp"
+#include "cpu/acl/acl_convolution_utils.hpp"
 #include "common/convolution_pd.hpp"
 #include "common/utils.hpp"
 #include "oneapi/dnnl/dnnl.h"
@@ -22,7 +22,7 @@
 namespace dnnl {
 namespace impl {
 namespace cpu {
-namespace aarch64 {
+namespace acl {
 
 namespace acl_convolution_utils {
 
@@ -283,9 +283,63 @@ status_t acl_init_conf(acl_conv_conf_t &acp, memory_desc_t &src_md,
     return status::success;
 }
+
+status_t init_conf_wino(acl_conv_conf_t &acp, memory_desc_t &src_md,
+        memory_desc_t &weights_md, memory_desc_t &dst_md,
+        memory_desc_t &bias_md, const convolution_desc_t &cd,
+        const primitive_attr_t &attr) {
+
+    // Under these conditions, fall back to the faster GEMM-based convolution
+    // unless the user explicitly specifies the Winograd algorithm
+    // clang-format off
+
+    // Heuristic only for servers
+    if (dnnl_get_max_threads() > 28 && cd.alg_kind == alg_kind::convolution_auto) {
+        return status::unimplemented;
+    }
+    // Heuristic for other devices
+    if (one_of(true, src_md.dims[1] < 64, // ic
+                dst_md.dims[1] < 64) // oc
+            && cd.alg_kind == alg_kind::convolution_auto) {
+        return status::unimplemented;
+    }
+
+    // clang-format on
+
+    // General Compute Library checks, memory tags are also set there
+    acp.alg_winograd = true;
+    CHECK(acl_init_conf(acp, src_md, weights_md, dst_md, bias_md, cd, attr));
+
+    const bool shape_ok
+            // only unit strides allowed
+            = (acp.padstride_info.stride()
+                      == std::pair<unsigned int, unsigned int> {1, 1})
+            // Note: Compute Library supports arbitrary padding for wino kernels
+            // but we only allow small padding to be consistent with oneDNN
+            && (acp.padstride_info.pad().first <= 1) // padding left/right
+            && (acp.padstride_info.pad().second <= 1) // padding top/bottom
+            // only non-dilated convolutions allowed
+            && (acp.dilation_info == arm_compute::Size2D(1, 1));
+
+    ACL_CHECK_SUPPORT(!shape_ok, "shape not supported by winograd kernels");
+
+    // clang-format off
+    // Validate convolution manually to check for return status
+    ACL_CHECK_VALID(arm_compute::NEWinogradConvolutionLayer::validate(
+        &acp.src_tensor_info,
+        &acp.wei_tensor_info,
+        acp.with_bias ? &acp.bia_tensor_info : nullptr,
+        &acp.dst_tensor_info,
+        acp.padstride_info,
+        acp.act_info,
+        true)); // enable_fast_math flag in ACL Winograd
+    // clang-format on
+
+    return status::success;
+}
+
 } // namespace acl_convolution_utils
 
-} // namespace aarch64
+} // namespace acl
 } // namespace cpu
 } // namespace impl
 } // namespace dnnl
diff --git a/src/cpu/acl/acl_convolution_utils.hpp b/src/cpu/acl/acl_convolution_utils.hpp
new file mode 100644
index 00000000000..fb616e71a7c
--- /dev/null
+++ b/src/cpu/acl/acl_convolution_utils.hpp
@@ -0,0 +1,239 @@
+/*******************************************************************************
+* Copyright 2020-2025 Arm Ltd. and affiliates
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#ifndef CPU_ACL_CONVOLUTION_UTILS_HPP
+#define CPU_ACL_CONVOLUTION_UTILS_HPP
+
+#include <map>
+#include "acl_post_ops.hpp"
+#include "acl_utils.hpp"
+#include "arm_compute/runtime/experimental/operators/CpuDepthwiseConv2d.h"
+#include "cpu/cpu_convolution_pd.hpp"
+#include <vector>
+namespace dnnl {
+namespace impl {
+namespace cpu {
+namespace acl {
+
+template <typename ConvOp>
+struct acl_obj_t {
+    arm_compute::Tensor src_tensor;
+    arm_compute::Tensor wei_tensor;
+    arm_compute::Tensor bia_tensor;
+    arm_compute::Tensor dst_tensor;
+    ConvOp conv;
+    arm_compute::experimental::MemoryRequirements aux_mem_req;
+};
+
+struct acl_conv_conf_t {
+    bool with_bias;
+    bool fast_math;
+    // If this is true, the result of the convolution goes into a temporarily
+    // allocated ACL tensor to be accumulated into the oneDNN dst during postops
+    bool use_dst_acc_for_sum;
+    // Indicates that the selected algorithm is Winograd. This is needed because the
+    // algorithm can be set to algorithm::convolution_auto and later on we need to
+    // skip fixed-format protocol as ACL Winograd does not support it.
+    bool alg_winograd;
+    arm_compute::TensorInfo src_tensor_info;
+    arm_compute::TensorInfo wei_tensor_info;
+    arm_compute::TensorInfo bia_tensor_info;
+    arm_compute::TensorInfo dst_tensor_info;
+
+    arm_compute::PadStrideInfo padstride_info;
+    arm_compute::Size2D dilation_info;
+    // Additional information about the weights not included in wei_tensor_info
+    arm_compute::WeightsInfo weights_info;
+    // Note: this will default to not enabled, and will do nothing
+    arm_compute::ActivationLayerInfo act_info;
+};
+
+namespace acl_convolution_utils {
+
+status_t acl_init_conf(acl_conv_conf_t &acp, memory_desc_t &src_md,
+        memory_desc_t &weights_md, memory_desc_t &dst_md,
+        memory_desc_t &bias_md, const convolution_desc_t &cd,
+        const primitive_attr_t &attr);
+
+status_t init_conf_wino(acl_conv_conf_t &acp, memory_desc_t &src_md,
+        memory_desc_t &weights_md, memory_desc_t &dst_md,
+        memory_desc_t &bias_md, const convolution_desc_t &cd,
+        const primitive_attr_t &attr);
+
+} // namespace acl_convolution_utils
+
+// Keys are anonymous with local linkage. So deduce the type automagically.
+using conv_key_t = decltype(memory_tracking::names::key_gemm_tmp_buffer);
+
+template <typename op_t, typename post_ops_t>
+status_t init_scratchpad(op_t &conv, memory_tracking::registrar_t &scratchpad,
+        const std::map<int, conv_key_t> &conv_keys, engine_t *engine,
+        post_ops_t &post_ops, dnnl::impl::post_ops_t &attr_post_ops,
+        arm_compute::ActivationLayerInfo &act_info, bool &use_dst_acc_for_sum,
+        const dnnl::impl::memory_desc_t &dst_md) {
+
+    // Book temp mem.
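+    // Every auxiliary workspace the ACL operator reports via workspace() is
+    // booked on the oneDNN scratchpad under the key it maps to in conv_keys,
+    // using the size and alignment ACL requested.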
+    const auto aux_mem_req = conv.workspace();
+    for (const auto &key : conv_keys) {
+        const auto id = key.first;
+        if (aux_mem_req[id].size > 0) {
+            scratchpad.book(key.second, aux_mem_req[id].size, 1,
+                    aux_mem_req[id].alignment, aux_mem_req[id].alignment);
+        }
+    }
+
+    CHECK(post_ops.init(engine, attr_post_ops, dst_md, act_info));
+    use_dst_acc_for_sum = post_ops.has_sum();
+
+    if (use_dst_acc_for_sum) {
+        const memory_desc_wrapper dst_d(&dst_md);
+        scratchpad.book(memory_tracking::names::key_generic_acc, dst_d.nelems(),
+                dst_d.data_type_size());
+    }
+
+    return status::success;
+}
+
+template <typename conv_obj_t, typename conv_pd_t, typename src_data_t,
+        typename wei_data_t, typename bia_data_t, typename dst_data_t>
+status_t execute_forward_conv_acl(const exec_ctx_t &ctx,
+        conv_obj_t *acl_conv_obj, const conv_pd_t *pd,
+        const std::map<int, conv_key_t> &conv_keys) {
+
+    auto src_base = CTX_IN_MEM(const src_data_t *, DNNL_ARG_SRC);
+    auto wei_base = CTX_IN_MEM(const wei_data_t *, DNNL_ARG_WEIGHTS);
+
+    // import_memory() and free() methods do not allocate/free any additional
+    // memory, only acquire/release pointers.
+    arm_compute::Tensor src_tensor;
+    arm_compute::Tensor wei_tensor;
+    arm_compute::Tensor bia_tensor = nullptr;
+    arm_compute::Tensor dst_tensor;
+
+    auto const acp = pd->acp_;
+    src_tensor.allocator()->init(acp.src_tensor_info);
+    wei_tensor.allocator()->init(acp.wei_tensor_info);
+    dst_tensor.allocator()->init(acp.dst_tensor_info);
+
+    src_tensor.allocator()->import_memory(const_cast<src_data_t *>(src_base));
+    wei_tensor.allocator()->import_memory(const_cast<wei_data_t *>(wei_base));
+
+    const auto scratchpad = ctx.get_scratchpad_grantor();
+
+    // If we have an unfused sum post op, put the result in a scratchpad tensor.
+    // Result will be summed to the dst during acl_post_ops.execute
+    auto dst_base = acp.use_dst_acc_for_sum
+            ? scratchpad.get<void>(memory_tracking::names::key_generic_acc)
+            : CTX_OUT_MEM(dst_data_t *, DNNL_ARG_DST);
+    dst_tensor.allocator()->import_memory(dst_base);
+
+    if (acp.with_bias) {
+        auto bia_base = CTX_IN_MEM(const bia_data_t *, DNNL_ARG_BIAS);
+        bia_tensor.allocator()->init(acp.bia_tensor_info);
+        bia_tensor.allocator()->import_memory(
+                const_cast<bia_data_t *>(bia_base));
+    }
+
+    // Constness of the weight tensor matters for depthwise conv in ACL.
+    // Otherwise, it will package the weights more often than needed, as
+    // it will expect the weights to change within the duration of the run
+    // func.
+    arm_compute::ITensorPack pack;
+    pack.add_tensor(arm_compute::TensorType::ACL_SRC_0, &src_tensor);
+    pack.add_const_tensor(arm_compute::TensorType::ACL_SRC_1, &wei_tensor);
+    pack.add_const_tensor(arm_compute::TensorType::ACL_SRC_2, &bia_tensor);
+    pack.add_tensor(arm_compute::TensorType::ACL_DST, &dst_tensor);
+
+    // Get temp workspaces.
+    const auto aux_mem = acl_conv_obj->aux_mem_req;
+
+    // Hold onto tmp tensors while we need pack.
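+    // The scratchpad buffers booked in init_scratchpad are wrapped in ACL
+    // tensors and added to the pack under the slot ids ACL expects; they
+    // must stay alive until conv.run(pack) below has completed.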
+    std::vector<arm_compute::Tensor> tmp_tensors(aux_mem.size());
+    for (const auto &key : conv_keys) {
+        const auto id = key.first;
+        if (aux_mem[id].size > 0) {
+            const auto info = arm_compute::TensorInfo(
+                    arm_compute::TensorShape(aux_mem[id].size), 1,
+                    arm_compute::DataType::U8);
+            auto buffer = scratchpad.get<void>(key.second);
+            tmp_tensors[id].allocator()->init(info, aux_mem[id].alignment);
+            tmp_tensors[id].allocator()->import_memory(buffer);
+            pack.add_tensor(aux_mem[id].slot, &tmp_tensors[id]);
+        }
+    }
+
+    acl_conv_obj->conv.run(pack);
+
+    void *dst = dst_tensor.buffer();
+    pd->post_ops.execute(ctx, dst);
+
+    return status::success;
+}
+
+template <typename conv_obj_t, typename conv_pd_t, typename src_data_t,
+        typename wei_data_t, typename bia_data_t, typename dst_data_t>
+status_t execute_forward_conv_acl(
+        const exec_ctx_t &ctx, conv_obj_t &acl_conv_obj, const conv_pd_t *pd) {
+    bool with_bias = pd->acp_.with_bias;
+    bool use_dst_acc_for_sum = pd->acp_.use_dst_acc_for_sum;
+
+    auto src_base = CTX_IN_MEM(const src_data_t *, DNNL_ARG_SRC);
+    auto wei_base = CTX_IN_MEM(const wei_data_t *, DNNL_ARG_WEIGHTS);
+
+    // import_memory() and free() methods do not allocate/free any additional
+    // memory, only acquire/release pointers.
+    acl_conv_obj.src_tensor.allocator()->import_memory(
+            const_cast<src_data_t *>(src_base));
+    acl_conv_obj.wei_tensor.allocator()->import_memory(
+            const_cast<wei_data_t *>(wei_base));
+
+    const auto scratchpad = ctx.get_scratchpad_grantor();
+
+    // If we have an unfused sum post op, put the result in a scratchpad tensor.
+    // Result will be summed to the dst during acl_post_ops.execute
+    auto dst_base = use_dst_acc_for_sum
+            ? scratchpad.get<void>(memory_tracking::names::key_generic_acc)
+            : CTX_OUT_MEM(dst_data_t *, DNNL_ARG_DST);
+    acl_conv_obj.dst_tensor.allocator()->import_memory(dst_base);
+
+    if (with_bias) {
+        auto bia_base = CTX_IN_MEM(const bia_data_t *, DNNL_ARG_BIAS);
+        acl_conv_obj.bia_tensor.allocator()->import_memory(
+                const_cast<bia_data_t *>(bia_base));
+    }
+
+    acl_conv_obj.conv.run();
+
+    acl_conv_obj.src_tensor.allocator()->free();
+    acl_conv_obj.wei_tensor.allocator()->free();
+    if (with_bias) { acl_conv_obj.bia_tensor.allocator()->free(); }
+
+    void *dst = acl_conv_obj.dst_tensor.buffer();
+    pd->post_ops.execute(ctx, dst);
+
+    acl_conv_obj.dst_tensor.allocator()->free();
+
+    return status::success;
+}
+
+} // namespace acl
+} // namespace cpu
+} // namespace impl
+} // namespace dnnl
+
+#endif // CPU_ACL_CONVOLUTION_UTILS_HPP
diff --git a/src/cpu/aarch64/acl_deconvolution.cpp b/src/cpu/acl/acl_deconvolution.cpp
similarity index 96%
rename from src/cpu/aarch64/acl_deconvolution.cpp
rename to src/cpu/acl/acl_deconvolution.cpp
index cdeca9cb8bb..0eef20dbabc 100644
--- a/src/cpu/aarch64/acl_deconvolution.cpp
+++ b/src/cpu/acl/acl_deconvolution.cpp
@@ -14,12 +14,12 @@
 * limitations under the License.
*******************************************************************************/ -#include "cpu/aarch64/acl_deconvolution.hpp" +#include "cpu/acl/acl_deconvolution.hpp" namespace dnnl { namespace impl { namespace cpu { -namespace aarch64 { +namespace acl { status_t acl_deconvolution_fwd_t::execute_forward(const exec_ctx_t &ctx) const { @@ -64,7 +64,7 @@ status_t acl_deconvolution_fwd_t::execute_forward(const exec_ctx_t &ctx) const { return status::success; } -} // namespace aarch64 +} // namespace acl } // namespace cpu } // namespace impl } // namespace dnnl diff --git a/src/cpu/aarch64/acl_deconvolution.hpp b/src/cpu/acl/acl_deconvolution.hpp similarity index 92% rename from src/cpu/aarch64/acl_deconvolution.hpp rename to src/cpu/acl/acl_deconvolution.hpp index 97413c7ba65..18c8c1f1a67 100644 --- a/src/cpu/aarch64/acl_deconvolution.hpp +++ b/src/cpu/acl/acl_deconvolution.hpp @@ -14,16 +14,16 @@ * limitations under the License. *******************************************************************************/ -#ifndef CPU_AARCH64_ACL_DECONVOLUTION_HPP -#define CPU_AARCH64_ACL_DECONVOLUTION_HPP +#ifndef CPU_ACL_DECONVOLUTION_HPP +#define CPU_ACL_DECONVOLUTION_HPP -#include "cpu/aarch64/acl_post_ops.hpp" +#include "cpu/acl/acl_post_ops.hpp" #include "cpu/cpu_deconvolution_pd.hpp" namespace dnnl { namespace impl { namespace cpu { -namespace aarch64 { +namespace acl { struct acl_deconv_obj_t { arm_compute::NEDeconvolutionLayer deconv; @@ -82,10 +82,6 @@ struct acl_deconv_resource_t : public resource_t { struct acl_deconvolution_fwd_t : public primitive_t { struct pd_t : public cpu_deconvolution_fwd_pd_t { using cpu_deconvolution_fwd_pd_t::cpu_deconvolution_fwd_pd_t; - pd_t(const deconvolution_desc_t *adesc, const primitive_attr_t *attr, - const deconvolution_fwd_pd_t *hint_fwd_pd) - : cpu_deconvolution_fwd_pd_t(adesc, attr, hint_fwd_pd) - , acl_pd_conf() {} DECLARE_COMMON_PD_T( "acl", acl_deconvolution_fwd_t, USE_GLOBAL_SCRATCHPAD); @@ -193,8 +189,9 @@ struct acl_deconvolution_fwd_t : public primitive_t { } // Data layout - const auto acl_layout = is_nspc ? arm_compute::DataLayout::NHWC - : arm_compute::DataLayout::NCHW; + const arm_compute::DataLayout acl_layout = is_nspc + ? arm_compute::DataLayout::NHWC + : arm_compute::DataLayout::NCHW; acl_pd_conf.src_info = arm_compute::TensorInfo(is_nspc ? arm_compute::TensorShape(ic, iw, ih, mb) @@ -243,18 +240,15 @@ struct acl_deconvolution_fwd_t : public primitive_t { // padding is set for convolution. Otherwise, describe deconvolution as convolution of // upsampling input with stride = 1 and pad = 0. arm_compute::ConvolutionMethod conv_method; - arm_compute::TensorInfo *conv_src_info; + arm_compute::TensorInfo conv_src_info( + acl_pd_conf.src_info.clone()->set_is_resizable(true)); unsigned int pad_left = 0; unsigned int pad_right = 0; unsigned int pad_top = 0; unsigned int pad_bottom = 0; if (sh != 1 || sw != 1) { - arm_compute::TensorInfo scale_out_info( - acl_pd_conf.src_info.clone() - ->set_is_resizable(true) - .reset_padding() - .set_tensor_shape(scale_out_shape)); - conv_src_info = &scale_out_info; + conv_src_info.reset_padding(); + conv_src_info.set_tensor_shape(scale_out_shape); } else { // compute correct padding here pad_left = pr > pl ? 
pr - pl : 0; @@ -269,15 +263,13 @@ struct acl_deconvolution_fwd_t : public primitive_t { pad_right += deconv_pad_x / 2; pad_top += deconv_pad_y / 2; pad_bottom += deconv_pad_y / 2; - - conv_src_info = &acl_pd_conf.src_info; } const arm_compute::PadStrideInfo conv_info(1, 1, pad_left, pad_right, pad_top, pad_bottom, arm_compute::DimensionRoundingType::CEIL); conv_method = arm_compute::NEConvolutionLayer::get_convolution_method( - conv_src_info, &acl_pd_conf.wei_info, + &conv_src_info, &acl_pd_conf.wei_info, &acl_pd_conf.dst_info, conv_info, arm_compute::WeightsInfo(), arm_compute::Size2D(1U, 1U), @@ -302,7 +294,7 @@ struct acl_deconvolution_fwd_t : public primitive_t { return status::success; } - acl_deconv_conf_t acl_pd_conf; + acl_deconv_conf_t acl_pd_conf = utils::zero(); acl_post_ops_t post_ops; private: @@ -338,7 +330,7 @@ struct acl_deconvolution_fwd_t : public primitive_t { const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); } }; // acl_deconvolution_fwd_t -} // namespace aarch64 +} // namespace acl } // namespace cpu } // namespace impl } // namespace dnnl diff --git a/src/cpu/aarch64/acl_depthwise_convolution.cpp b/src/cpu/acl/acl_depthwise_convolution.cpp similarity index 96% rename from src/cpu/aarch64/acl_depthwise_convolution.cpp rename to src/cpu/acl/acl_depthwise_convolution.cpp index 4752cfd5852..15edd205f76 100644 --- a/src/cpu/aarch64/acl_depthwise_convolution.cpp +++ b/src/cpu/acl/acl_depthwise_convolution.cpp @@ -14,15 +14,15 @@ * limitations under the License. *******************************************************************************/ -#include "cpu/aarch64/acl_depthwise_convolution.hpp" +#include "cpu/acl/acl_depthwise_convolution.hpp" namespace dnnl { namespace impl { namespace cpu { -namespace aarch64 { +namespace acl { namespace { -using data_t = prec_traits::type; +using data_t = prec_traits_t::type; // Keys are anonymous. So deduce the type automagically. using conv_key_t = decltype(memory_tracking::names::key_gemm_tmp_buffer); @@ -87,7 +87,7 @@ status_t acl_depthwise_convolution_fwd_t::init(engine_t *engine) { acl_obj_->aux_mem_req = acl_obj_->conv.workspace(); return status::success; } -} // namespace aarch64 +} // namespace acl } // namespace cpu } // namespace impl } // namespace dnnl diff --git a/src/cpu/aarch64/acl_depthwise_convolution.hpp b/src/cpu/acl/acl_depthwise_convolution.hpp similarity index 81% rename from src/cpu/aarch64/acl_depthwise_convolution.hpp rename to src/cpu/acl/acl_depthwise_convolution.hpp index 3e3f0e1ccbc..61c39332a67 100644 --- a/src/cpu/aarch64/acl_depthwise_convolution.hpp +++ b/src/cpu/acl/acl_depthwise_convolution.hpp @@ -14,8 +14,8 @@ * limitations under the License. 
*******************************************************************************/ -#ifndef CPU_AARCH64_ACL_DEPTHWISE_CONVOLUTION_HPP -#define CPU_AARCH64_ACL_DEPTHWISE_CONVOLUTION_HPP +#ifndef CPU_ACL_DEPTHWISE_CONVOLUTION_HPP +#define CPU_ACL_DEPTHWISE_CONVOLUTION_HPP #include "acl_convolution_utils.hpp" #include "arm_compute/runtime/experimental/operators/CpuDepthwiseConv2d.h" @@ -24,23 +24,21 @@ namespace dnnl { namespace impl { namespace cpu { -namespace aarch64 { +namespace acl { struct acl_depthwise_convolution_fwd_t : public primitive_t { using Op = arm_compute::experimental::op::CpuDepthwiseConv2d; struct pd_t : public cpu_convolution_fwd_pd_t { - pd_t(const convolution_desc_t *adesc, const primitive_attr_t *attr, - const typename pd_t::base_class *hint_fwd_pd) - : cpu_convolution_fwd_pd_t(adesc, attr, hint_fwd_pd), acp_() {} + using cpu_convolution_fwd_pd_t::cpu_convolution_fwd_pd_t; DECLARE_COMMON_PD_T("depthwise_convolution:acl", acl_depthwise_convolution_fwd_t, USE_GLOBAL_SCRATCHPAD); status_t init(engine_t *engine); - acl_conv_conf_t acp_; + acl_conv_conf_t acp_ = utils::zero(); acl_post_ops_t post_ops; }; @@ -59,9 +57,9 @@ struct acl_depthwise_convolution_fwd_t : public primitive_t { std::unique_ptr> acl_obj_; }; -} // namespace aarch64 +} // namespace acl } // namespace cpu } // namespace impl } // namespace dnnl -#endif // CPU_AARCH64_ACL_DEPTHWISE_CONVOLUTION_HPP +#endif // CPU_ACL_DEPTHWISE_CONVOLUTION_HPP diff --git a/src/cpu/aarch64/acl_eltwise.cpp b/src/cpu/acl/acl_eltwise.cpp similarity index 98% rename from src/cpu/aarch64/acl_eltwise.cpp rename to src/cpu/acl/acl_eltwise.cpp index e7789825f42..98f539dd8f9 100644 --- a/src/cpu/aarch64/acl_eltwise.cpp +++ b/src/cpu/acl/acl_eltwise.cpp @@ -19,7 +19,7 @@ namespace dnnl { namespace impl { namespace cpu { -namespace aarch64 { +namespace acl { status_t acl_eltwise_fwd_t::execute(const exec_ctx_t &ctx) const { return execute_forward(ctx); @@ -108,7 +108,7 @@ status_t acl_eltwise_fwd_t::pd_t::init(engine_t *engine) { return status::success; } -} // namespace aarch64 +} // namespace acl } // namespace cpu } // namespace impl } // namespace dnnl diff --git a/src/cpu/aarch64/acl_eltwise.hpp b/src/cpu/acl/acl_eltwise.hpp similarity index 93% rename from src/cpu/aarch64/acl_eltwise.hpp rename to src/cpu/acl/acl_eltwise.hpp index bd64eac1936..45869414bec 100644 --- a/src/cpu/aarch64/acl_eltwise.hpp +++ b/src/cpu/acl/acl_eltwise.hpp @@ -14,8 +14,8 @@ * limitations under the License. 
*******************************************************************************/ -#ifndef CPU_AARCH64_ACL_ELTWISE_HPP -#define CPU_AARCH64_ACL_ELTWISE_HPP +#ifndef CPU_ACL_ELTWISE_HPP +#define CPU_ACL_ELTWISE_HPP #include #include "cpu/cpu_eltwise_pd.hpp" @@ -27,7 +27,7 @@ namespace dnnl { namespace impl { namespace cpu { -namespace aarch64 { +namespace acl { struct acl_eltwise_conf_t { arm_compute::ActivationLayerInfo act_info; @@ -71,9 +71,9 @@ struct acl_eltwise_fwd_t : public primitive_t { friend struct acl_post_ops_t; }; // acl_eltwise_fwd_t -} // namespace aarch64 +} // namespace acl } // namespace cpu } // namespace impl } // namespace dnnl -#endif // CPU_AARCH64_ACL_ELTWISE_HPP +#endif // CPU_ACL_ELTWISE_HPP diff --git a/src/cpu/aarch64/acl_gemm_convolution.cpp b/src/cpu/acl/acl_gemm_convolution.cpp similarity index 98% rename from src/cpu/aarch64/acl_gemm_convolution.cpp rename to src/cpu/acl/acl_gemm_convolution.cpp index d3a663d8c63..922ea42e396 100644 --- a/src/cpu/aarch64/acl_gemm_convolution.cpp +++ b/src/cpu/acl/acl_gemm_convolution.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2020-2024 Arm Ltd. and affiliates +* Copyright 2020-2025 Arm Ltd. and affiliates * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,7 +21,7 @@ namespace dnnl { namespace impl { namespace cpu { -namespace aarch64 { +namespace acl { namespace { // Keys are anonymous. So deduce the type automagically. @@ -112,7 +112,7 @@ template struct acl_gemm_convolution_fwd_t; template struct acl_gemm_convolution_fwd_t; template struct acl_gemm_convolution_fwd_t; -} // namespace aarch64 +} // namespace acl } // namespace cpu } // namespace impl } // namespace dnnl diff --git a/src/cpu/aarch64/acl_gemm_convolution.hpp b/src/cpu/acl/acl_gemm_convolution.hpp similarity index 75% rename from src/cpu/aarch64/acl_gemm_convolution.hpp rename to src/cpu/acl/acl_gemm_convolution.hpp index 23fe03f2d85..14d0050c7ab 100644 --- a/src/cpu/aarch64/acl_gemm_convolution.hpp +++ b/src/cpu/acl/acl_gemm_convolution.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2020-2024 Arm Ltd. and affiliates +* Copyright 2020-2025 Arm Ltd. and affiliates * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,8 +14,8 @@ * limitations under the License. 
*******************************************************************************/ -#ifndef CPU_AARCH64_ACL_GEMM_CONVOLUTION_HPP -#define CPU_AARCH64_ACL_GEMM_CONVOLUTION_HPP +#ifndef CPU_ACL_GEMM_CONVOLUTION_HPP +#define CPU_ACL_GEMM_CONVOLUTION_HPP #include "common/memory_tracking.hpp" #include "cpu/cpu_convolution_pd.hpp" @@ -27,7 +27,7 @@ namespace dnnl { namespace impl { namespace cpu { -namespace aarch64 { +namespace acl { template @@ -36,16 +36,14 @@ struct acl_gemm_convolution_fwd_t : public primitive_t { using Op = arm_compute::experimental::op::CpuGemmConv2d; struct pd_t : public cpu_convolution_fwd_pd_t { - pd_t(const convolution_desc_t *adesc, const primitive_attr_t *attr, - const typename pd_t::base_class *hint_fwd_pd) - : cpu_convolution_fwd_pd_t(adesc, attr, hint_fwd_pd), acp_() {} + using cpu_convolution_fwd_pd_t::cpu_convolution_fwd_pd_t; DECLARE_COMMON_PD_T( "gemm:acl", acl_gemm_convolution_fwd_t, USE_GLOBAL_SCRATCHPAD); status_t init(engine_t *engine); - acl_conv_conf_t acp_; + acl_conv_conf_t acp_ = utils::zero(); acl_post_ops_t post_ops; }; @@ -54,10 +52,10 @@ struct acl_gemm_convolution_fwd_t : public primitive_t { status_t init(engine_t *engine) override; - using src_data_t = typename prec_traits::type; - using wei_data_t = typename prec_traits::type; - using dst_data_t = typename prec_traits::type; - using bia_data_t = typename prec_traits::type; + using src_data_t = typename prec_traits_t::type; + using wei_data_t = typename prec_traits_t::type; + using dst_data_t = typename prec_traits_t::type; + using bia_data_t = typename prec_traits_t::type; status_t execute(const exec_ctx_t &ctx) const override { return execute_forward(ctx); @@ -69,7 +67,7 @@ struct acl_gemm_convolution_fwd_t : public primitive_t { std::unique_ptr> acl_obj_; }; // acl_gemm_convolution_fwd_t -} // namespace aarch64 +} // namespace acl } // namespace cpu } // namespace impl } // namespace dnnl diff --git a/src/cpu/aarch64/acl_indirect_gemm_convolution.cpp b/src/cpu/acl/acl_indirect_gemm_convolution.cpp similarity index 94% rename from src/cpu/aarch64/acl_indirect_gemm_convolution.cpp rename to src/cpu/acl/acl_indirect_gemm_convolution.cpp index 19ee68062da..d543b2b6b19 100644 --- a/src/cpu/aarch64/acl_indirect_gemm_convolution.cpp +++ b/src/cpu/acl/acl_indirect_gemm_convolution.cpp @@ -22,10 +22,10 @@ namespace dnnl { namespace impl { namespace cpu { -namespace aarch64 { +namespace acl { namespace { -using data_t = typename prec_traits::type; +using data_t = typename prec_traits_t::type; // Keys are anonymous. So deduce the type automagically. 
using conv_key_t = decltype(memory_tracking::names::key_gemm_tmp_buffer); @@ -96,11 +96,13 @@ status_t acl_indirect_gemm_convolution_fwd_t::pd_t::init(engine_t *engine) { const bool is_fp16_ok = expect_data_types(f16, f16, f16, f16, undef) && attr()->has_default_values(smask_t::post_ops, f16); + const bool is_bf16_ok = expect_data_types(bf16, bf16, bf16, bf16, undef) + && attr_.post_ops_.len() == 0; const bool is_fp32_ok = expect_data_types(f32, f32, f32, f32, undef) && attr()->has_default_values( smask_t::post_ops | smask_t::fpmath_mode, f32); bool ok = is_fwd() && set_default_alg_kind(alg_kind::convolution_direct) - && utils::one_of(true, is_fp16_ok, is_fp32_ok) + && utils::one_of(true, is_fp16_ok, is_bf16_ok, is_fp32_ok) && !has_zero_dim_memory(); if (!ok) return status::unimplemented; @@ -120,7 +122,7 @@ status_t acl_indirect_gemm_convolution_fwd_t::pd_t::init(engine_t *engine) { dst_md_); } -} // namespace aarch64 +} // namespace acl } // namespace cpu } // namespace impl } // namespace dnnl diff --git a/src/cpu/aarch64/acl_indirect_gemm_convolution.hpp b/src/cpu/acl/acl_indirect_gemm_convolution.hpp similarity index 81% rename from src/cpu/aarch64/acl_indirect_gemm_convolution.hpp rename to src/cpu/acl/acl_indirect_gemm_convolution.hpp index d5b914e5fd7..7286cc3ced6 100644 --- a/src/cpu/aarch64/acl_indirect_gemm_convolution.hpp +++ b/src/cpu/acl/acl_indirect_gemm_convolution.hpp @@ -14,8 +14,8 @@ * limitations under the License. *******************************************************************************/ -#ifndef CPU_AARCH64_ACL_INDIRECT_GEMM_CONVOLUTION_HPP -#define CPU_AARCH64_ACL_INDIRECT_GEMM_CONVOLUTION_HPP +#ifndef CPU_ACL_INDIRECT_GEMM_CONVOLUTION_HPP +#define CPU_ACL_INDIRECT_GEMM_CONVOLUTION_HPP #include "cpu/cpu_convolution_pd.hpp" @@ -25,24 +25,21 @@ namespace dnnl { namespace impl { namespace cpu { -namespace aarch64 { +namespace acl { struct acl_indirect_gemm_convolution_fwd_t : public primitive_t { using Op = arm_compute::experimental::op::CpuGemmDirectConv2d; struct pd_t : public cpu_convolution_fwd_pd_t { - - pd_t(const convolution_desc_t *adesc, const primitive_attr_t *attr, - const typename pd_t::base_class *hint_fwd_pd) - : cpu_convolution_fwd_pd_t(adesc, attr, hint_fwd_pd), acp_() {} + using cpu_convolution_fwd_pd_t::cpu_convolution_fwd_pd_t; DECLARE_COMMON_PD_T("indirect_gemm:acl", acl_indirect_gemm_convolution_fwd_t, USE_GLOBAL_SCRATCHPAD); status_t init(engine_t *engine); - acl_conv_conf_t acp_; + acl_conv_conf_t acp_ = utils::zero(); acl_post_ops_t post_ops; private: @@ -64,9 +61,9 @@ struct acl_indirect_gemm_convolution_fwd_t : public primitive_t { std::unique_ptr> acl_obj_; }; -} // namespace aarch64 +} // namespace acl } // namespace cpu } // namespace impl } // namespace dnnl -#endif // CPU_AARCH64_ACL_INDIRECT_GEMM_CONVOLUTION_HPP +#endif // CPU_ACL_INDIRECT_GEMM_CONVOLUTION_HPP diff --git a/src/cpu/aarch64/acl_inner_product.cpp b/src/cpu/acl/acl_inner_product.cpp similarity index 96% rename from src/cpu/aarch64/acl_inner_product.cpp rename to src/cpu/acl/acl_inner_product.cpp index 34de43ae638..9dcceaa2d30 100644 --- a/src/cpu/aarch64/acl_inner_product.cpp +++ b/src/cpu/acl/acl_inner_product.cpp @@ -14,12 +14,12 @@ * limitations under the License. 
*******************************************************************************/ -#include "cpu/aarch64/acl_inner_product.hpp" +#include "cpu/acl/acl_inner_product.hpp" namespace dnnl { namespace impl { namespace cpu { -namespace aarch64 { +namespace acl { status_t acl_inner_product_fwd_t::execute_forward(const exec_ctx_t &ctx) const { @@ -70,7 +70,7 @@ status_t acl_inner_product_fwd_t::execute_forward(const exec_ctx_t &ctx) const { return status::success; } -} // namespace aarch64 +} // namespace acl } // namespace cpu } // namespace impl } // namespace dnnl diff --git a/src/cpu/aarch64/acl_inner_product.hpp b/src/cpu/acl/acl_inner_product.hpp similarity index 93% rename from src/cpu/aarch64/acl_inner_product.hpp rename to src/cpu/acl/acl_inner_product.hpp index 336168ba626..4dd84e4fa8b 100644 --- a/src/cpu/aarch64/acl_inner_product.hpp +++ b/src/cpu/acl/acl_inner_product.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2021-2024 Arm Ltd. and affiliates +* Copyright 2021-2025 Arm Ltd. and affiliates * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,18 +14,18 @@ * limitations under the License. *******************************************************************************/ -#ifndef CPU_AARCH64_ACL_INNER_PRODUCT_HPP -#define CPU_AARCH64_ACL_INNER_PRODUCT_HPP +#ifndef CPU_ACL_INNER_PRODUCT_HPP +#define CPU_ACL_INNER_PRODUCT_HPP -#include "cpu/aarch64/acl_utils.hpp" +#include "cpu/acl/acl_utils.hpp" #include "cpu/cpu_inner_product_pd.hpp" -#include "cpu/aarch64/acl_post_ops.hpp" +#include "cpu/acl/acl_post_ops.hpp" namespace dnnl { namespace impl { namespace cpu { -namespace aarch64 { +namespace acl { struct acl_ip_obj_t { arm_compute::NEFullyConnectedLayer fc; @@ -85,10 +85,6 @@ struct acl_inner_product_fwd_t : public primitive_t { struct pd_t : public cpu_inner_product_fwd_pd_t { using cpu_inner_product_fwd_pd_t::cpu_inner_product_fwd_pd_t; - pd_t(const inner_product_desc_t *adesc, const primitive_attr_t *attr, - const typename pd_t::base_class *hint_fwd_pd) - : cpu_inner_product_fwd_pd_t(adesc, attr, hint_fwd_pd), aip() {} - DECLARE_COMMON_PD_T("acl", acl_inner_product_fwd_t); status_t init(engine_t *engine) { @@ -101,6 +97,9 @@ struct acl_inner_product_fwd_t : public primitive_t { const bool is_fp32_ok = expect_data_types(f32, f32, f32, f32, undef) && attr()->has_default_values( smask_t::post_ops | smask_t::fpmath_mode, f32); + const bool is_bf16_ok + = expect_data_types(bf16, bf16, bf16, bf16, undef) + && attr()->has_default_values(smask_t::post_ops, bf16); const bool is_fp32_bf16_ok = expect_data_types(f32, bf16, f32, f32, undef) && attr()->has_default_values( @@ -109,8 +108,8 @@ struct acl_inner_product_fwd_t : public primitive_t { = utils::one_of(weights_format_kind_received, format_kind::any, format_kind::blocked); const bool ok = is_fwd() && !has_zero_dim_memory() - && utils::one_of( - true, is_fp16_ok, is_fp32_ok, is_fp32_bf16_ok) + && utils::one_of(true, is_fp16_ok, is_fp32_ok, + is_fp32_bf16_ok, is_bf16_ok) && is_weights_md_format_ok && set_default_params(true) == status::success; @@ -128,7 +127,7 @@ struct acl_inner_product_fwd_t : public primitive_t { return status::success; } - acl_ip_conf_t aip; + acl_ip_conf_t aip = utils::zero(); acl_post_ops_t post_ops; @@ -257,8 +256,11 @@ struct acl_inner_product_fwd_t : public primitive_t { // Fallback int block_by = arm_compute::block_by(expected_weight_format); + bool is_bf16 = 
src_md()->data_type == data_type::bf16 + && weights_md()->data_type == data_type::bf16 + && dst_md()->data_type == data_type::bf16; if (is_4d && weights_md_.dims[inner_dim] % block_by != 0 - && aip.fc_info.enable_fast_math) { + && (aip.fc_info.enable_fast_math || is_bf16)) { aip.fc_info.enable_fast_math = false; aip.weights_info.set_weight_format( arm_compute::WeightFormat::ANY); @@ -331,9 +333,9 @@ struct acl_inner_product_fwd_t : public primitive_t { const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); } }; // acl_inner_product_fwd_t -} // namespace aarch64 +} // namespace acl } // namespace cpu } // namespace impl } // namespace dnnl -#endif // CPU_AARCH64_ACL_INNER_PRODUCT_HPP +#endif // CPU_ACL_INNER_PRODUCT_HPP diff --git a/src/cpu/aarch64/acl_layer_normalization.cpp b/src/cpu/acl/acl_layer_normalization.cpp similarity index 94% rename from src/cpu/aarch64/acl_layer_normalization.cpp rename to src/cpu/acl/acl_layer_normalization.cpp index 05bcb1766f1..11c4796d7d5 100644 --- a/src/cpu/aarch64/acl_layer_normalization.cpp +++ b/src/cpu/acl/acl_layer_normalization.cpp @@ -14,12 +14,12 @@ * limitations under the License. *******************************************************************************/ -#include "cpu/aarch64/acl_layer_normalization.hpp" +#include "cpu/acl/acl_layer_normalization.hpp" namespace dnnl { namespace impl { namespace cpu { -namespace aarch64 { +namespace acl { status_t acl_layer_normalization_fwd_t::execute_forward( const exec_ctx_t &ctx) const { @@ -48,7 +48,7 @@ status_t acl_layer_normalization_fwd_t::execute_forward( return status::success; } -} // namespace aarch64 +} // namespace acl } // namespace cpu } // namespace impl } // namespace dnnl diff --git a/src/cpu/aarch64/acl_layer_normalization.hpp b/src/cpu/acl/acl_layer_normalization.hpp similarity index 92% rename from src/cpu/aarch64/acl_layer_normalization.hpp rename to src/cpu/acl/acl_layer_normalization.hpp index 80dd681a84b..9363511a521 100644 --- a/src/cpu/aarch64/acl_layer_normalization.hpp +++ b/src/cpu/acl/acl_layer_normalization.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2023 Arm Ltd. and affiliates +* Copyright 2023-2024 Arm Ltd. and affiliates * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,16 +14,16 @@ * limitations under the License. 
*******************************************************************************/ -#ifndef CPU_AARCH64_ACL_LAYER_NORMALIZATION_HPP -#define CPU_AARCH64_ACL_LAYER_NORMALIZATION_HPP +#ifndef CPU_ACL_LAYER_NORMALIZATION_HPP +#define CPU_ACL_LAYER_NORMALIZATION_HPP -#include "cpu/aarch64/acl_utils.hpp" +#include "cpu/acl/acl_utils.hpp" #include "cpu/cpu_layer_normalization_pd.hpp" namespace dnnl { namespace impl { namespace cpu { -namespace aarch64 { +namespace acl { struct acl_msdnorm_obj_t { arm_compute::NEMeanStdDevNormalizationLayer msdNorm; @@ -68,11 +68,6 @@ struct acl_layer_normalization_fwd_t : public primitive_t { struct pd_t : public cpu_layer_normalization_fwd_pd_t { using cpu_layer_normalization_fwd_pd_t:: cpu_layer_normalization_fwd_pd_t; - pd_t(const layer_normalization_desc_t *adesc, - const primitive_attr_t *attr, - const typename pd_t::base_class *hint_fwd_pd) - : cpu_layer_normalization_fwd_pd_t(adesc, attr, hint_fwd_pd) - , anp() {} DECLARE_COMMON_PD_T("acl", acl_layer_normalization_fwd_t); @@ -81,9 +76,10 @@ struct acl_layer_normalization_fwd_t : public primitive_t { // dir and flags ACL_CHECK_SUPPORT( !is_fwd(), "ACL lnorm supports forward propagation only"); - ACL_CHECK_SUPPORT(is_training() && !use_global_stats(), - "ACL only supports forward training with lnorm if stats " - "are provided (use global stats)"); + ACL_CHECK_SUPPORT( + is_training(), "ACL supports inference only for lnorm"); + ACL_CHECK_SUPPORT(use_global_stats(), + "ACL does not support global stats with lnorm"); ACL_CHECK_SUPPORT(use_scale() || use_shift(), "ACL does not support lnorm scale and shift"); @@ -219,7 +215,7 @@ struct acl_layer_normalization_fwd_t : public primitive_t { || X * C > acl_better_XC_per_thread * threads); } - acl_msdnorm_conf_t anp; + acl_msdnorm_conf_t anp = utils::zero(); }; // pd_t @@ -250,9 +246,9 @@ struct acl_layer_normalization_fwd_t : public primitive_t { const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); } }; // acl_layer_normalization_fwd_t -} // namespace aarch64 +} // namespace acl } // namespace cpu } // namespace impl } // namespace dnnl -#endif // CPU_AARCH64_ACL_LAYER_NORMALIZATION_HPP +#endif // CPU_ACL_LAYER_NORMALIZATION_HPP diff --git a/src/cpu/aarch64/acl_pooling.cpp b/src/cpu/acl/acl_pooling.cpp similarity index 96% rename from src/cpu/aarch64/acl_pooling.cpp rename to src/cpu/acl/acl_pooling.cpp index 1aac8c53a34..e3ecd290638 100644 --- a/src/cpu/aarch64/acl_pooling.cpp +++ b/src/cpu/acl/acl_pooling.cpp @@ -14,12 +14,12 @@ * limitations under the License. *******************************************************************************/ -#include "cpu/aarch64/acl_pooling.hpp" +#include "cpu/acl/acl_pooling.hpp" namespace dnnl { namespace impl { namespace cpu { -namespace aarch64 { +namespace acl { status_t acl_pooling_fwd_t::execute_forward(const exec_ctx_t &ctx) const { // Lock here is needed because resource_mapper does not support @@ -52,7 +52,7 @@ status_t acl_pooling_fwd_t::execute_forward(const exec_ctx_t &ctx) const { return status; } -} // namespace aarch64 +} // namespace acl } // namespace cpu } // namespace impl } // namespace dnnl diff --git a/src/cpu/aarch64/acl_pooling.hpp b/src/cpu/acl/acl_pooling.hpp similarity index 96% rename from src/cpu/aarch64/acl_pooling.hpp rename to src/cpu/acl/acl_pooling.hpp index a397d69aa7d..a696dac8e69 100644 --- a/src/cpu/aarch64/acl_pooling.hpp +++ b/src/cpu/acl/acl_pooling.hpp @@ -14,16 +14,16 @@ * limitations under the License. 
*******************************************************************************/ -#ifndef CPU_AARCH64_ACL_POOLING_HPP -#define CPU_AARCH64_ACL_POOLING_HPP +#ifndef CPU_ACL_POOLING_HPP +#define CPU_ACL_POOLING_HPP -#include "cpu/aarch64/acl_utils.hpp" +#include "cpu/acl/acl_utils.hpp" #include "cpu/cpu_pooling_pd.hpp" namespace dnnl { namespace impl { namespace cpu { -namespace aarch64 { +namespace acl { struct acl_pooling_obj_t { arm_compute::NEPoolingLayer pool; @@ -77,9 +77,6 @@ struct acl_pooling_resource_t : public resource_t { struct acl_pooling_fwd_t : public primitive_t { struct pd_t : public cpu_pooling_fwd_pd_t { using cpu_pooling_fwd_pd_t::cpu_pooling_fwd_pd_t; - pd_t(const pooling_desc_t *adesc, const primitive_attr_t *attr, - const pooling_fwd_pd_t *hint_fwd_pd) - : cpu_pooling_fwd_pd_t(adesc, attr, hint_fwd_pd), app() {} DECLARE_COMMON_PD_T("acl", acl_pooling_fwd_t); @@ -265,7 +262,7 @@ struct acl_pooling_fwd_t : public primitive_t { return problem_size > cutoff * thread_count; } - acl_pooling_conf_t app; + acl_pooling_conf_t app = utils::zero(); }; acl_pooling_fwd_t(const pd_t *apd) : primitive_t(apd) {} @@ -295,9 +292,9 @@ struct acl_pooling_fwd_t : public primitive_t { const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); } }; // acl_pooling_fwd_t -} // namespace aarch64 +} // namespace acl } // namespace cpu } // namespace impl } // namespace dnnl -#endif // CPU_AARCH64_ACL_POOLING_HPP +#endif // CPU_ACL_POOLING_HPP diff --git a/src/cpu/aarch64/acl_post_ops.cpp b/src/cpu/acl/acl_post_ops.cpp similarity index 97% rename from src/cpu/aarch64/acl_post_ops.cpp rename to src/cpu/acl/acl_post_ops.cpp index dbb1bf2d53c..816d195a920 100644 --- a/src/cpu/aarch64/acl_post_ops.cpp +++ b/src/cpu/acl/acl_post_ops.cpp @@ -15,12 +15,12 @@ *******************************************************************************/ #include "common/float16.hpp" -#include "cpu/aarch64/acl_gemm_convolution.hpp" +#include "cpu/acl/acl_gemm_convolution.hpp" namespace dnnl { namespace impl { namespace cpu { -namespace aarch64 { +namespace acl { status_t acl_post_ops_t::execute( const exec_ctx_t &ctx, void *src, void *dst) const { @@ -97,7 +97,7 @@ status_t acl_post_ops_t::execute( return status::success; } -} // namespace aarch64 +} // namespace acl } // namespace cpu } // namespace impl } // namespace dnnl diff --git a/src/cpu/aarch64/acl_post_ops.hpp b/src/cpu/acl/acl_post_ops.hpp similarity index 94% rename from src/cpu/aarch64/acl_post_ops.hpp rename to src/cpu/acl/acl_post_ops.hpp index 5c80f413463..d5e470e4578 100644 --- a/src/cpu/aarch64/acl_post_ops.hpp +++ b/src/cpu/acl/acl_post_ops.hpp @@ -14,16 +14,16 @@ * limitations under the License. *******************************************************************************/ -#ifndef CPU_AARCH64_ACL_POST_OPS_HPP -#define CPU_AARCH64_ACL_POST_OPS_HPP +#ifndef CPU_ACL_POST_OPS_HPP +#define CPU_ACL_POST_OPS_HPP -#include "cpu/aarch64/acl_binary.hpp" -#include "cpu/aarch64/acl_eltwise.hpp" +#include "cpu/acl/acl_binary.hpp" +#include "cpu/acl/acl_eltwise.hpp" namespace dnnl { namespace impl { namespace cpu { -namespace aarch64 { +namespace acl { struct acl_post_ops_t { @@ -142,10 +142,8 @@ struct acl_post_ops_t { CHECK(base_post_ops.set_default_formats(&dst_md)); dst_data_type = dst_md.data_type; - // If the first entry is eltwise, we fuse it, except when the datatype - // is fp16 because in this case we want to execute the eltwise in fp32. 
- if (base_post_ops.len() >= 1 && base_post_ops.entry_[0].is_eltwise() - && dst_data_type != data_type::f16) { + // If the first entry is eltwise, we fuse it + if (base_post_ops.len() >= 1 && base_post_ops.entry_[0].is_eltwise()) { const auto &first_po = base_post_ops.entry_[0].eltwise; ACL_CHECK_SUPPORT(first_po.scale != 1.0f, @@ -178,7 +176,7 @@ struct acl_post_ops_t { std::vector> post_op_primitives; }; -} // namespace aarch64 +} // namespace acl } // namespace cpu } // namespace impl } // namespace dnnl diff --git a/src/cpu/aarch64/acl_prelu.cpp b/src/cpu/acl/acl_prelu.cpp similarity index 96% rename from src/cpu/aarch64/acl_prelu.cpp rename to src/cpu/acl/acl_prelu.cpp index e2aae9392c0..b118fe20811 100644 --- a/src/cpu/aarch64/acl_prelu.cpp +++ b/src/cpu/acl/acl_prelu.cpp @@ -14,12 +14,12 @@ * limitations under the License. *******************************************************************************/ -#include "cpu/aarch64/acl_prelu.hpp" +#include "cpu/acl/acl_prelu.hpp" namespace dnnl { namespace impl { namespace cpu { -namespace aarch64 { +namespace acl { status_t acl_prelu_fwd_t::execute_forward(const exec_ctx_t &ctx) const { @@ -51,7 +51,7 @@ status_t acl_prelu_fwd_t::execute_forward(const exec_ctx_t &ctx) const { return status::success; } -} // namespace aarch64 +} // namespace acl } // namespace cpu } // namespace impl } // namespace dnnl diff --git a/src/cpu/aarch64/acl_prelu.hpp b/src/cpu/acl/acl_prelu.hpp similarity index 96% rename from src/cpu/aarch64/acl_prelu.hpp rename to src/cpu/acl/acl_prelu.hpp index 8517d1bb3ee..a7b70402687 100644 --- a/src/cpu/aarch64/acl_prelu.hpp +++ b/src/cpu/acl/acl_prelu.hpp @@ -13,16 +13,16 @@ * See the License for the specific language governing permissions and * limitations under the License. *******************************************************************************/ -#ifndef CPU_AARCH64_ACL_PRELU_HPP -#define CPU_AARCH64_ACL_PRELU_HPP +#ifndef CPU_ACL_PRELU_HPP +#define CPU_ACL_PRELU_HPP -#include "cpu/aarch64/acl_utils.hpp" +#include "cpu/acl/acl_utils.hpp" #include "cpu/cpu_prelu_pd.hpp" namespace dnnl { namespace impl { namespace cpu { -namespace aarch64 { +namespace acl { struct acl_prelu_obj_t { arm_compute::NEPReluLayer prelu; @@ -151,9 +151,9 @@ struct acl_prelu_fwd_t : public primitive_t { const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); } }; // acl_prelu_fwd_t -} // namespace aarch64 +} // namespace acl } // namespace cpu } // namespace impl } // namespace dnnl -#endif // CPU_AARCH64_ACL_PRELU_HPP +#endif // CPU_ACL_PRELU_HPP diff --git a/src/cpu/acl/acl_softmax.cpp b/src/cpu/acl/acl_softmax.cpp new file mode 100644 index 00000000000..9b8fea25759 --- /dev/null +++ b/src/cpu/acl/acl_softmax.cpp @@ -0,0 +1,173 @@ +/******************************************************************************* +* Copyright 2021-2024 Arm Ltd. and affiliates +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/
+
+#include "cpu/acl/acl_softmax.hpp"
+
+namespace dnnl {
+namespace impl {
+namespace cpu {
+namespace acl {
+
+const acl_softmax_fwd_t::pd_t *acl_softmax_fwd_t::pd() const {
+    return static_cast<const pd_t *>(primitive_t::pd().get());
+}
+
+status_t acl_softmax_fwd_t::pd_t::init(engine_t *engine) {
+
+    bool ok = is_fwd()
+            && set_default_formats() == status::success
+            // ACL only supports matching src/dst (this must come after
+            // set_default_formats() to handle format_kind::any)
+            && *src_md() == *dst_md()
+            && utils::one_of(
+                    src_md()->data_type, data_type::f32, data_type::f16)
+            && attr()->has_default_values();
+    if (!ok) return status::unimplemented;
+
+    // Get memory desc to find sizes and dims
+    const memory_desc_wrapper src_d(src_md());
+    const data_type_t data_type = src_d.data_type();
+
+    // ACL only supports plain tensors, can be permuted but not blocked
+    if (!src_d.is_plain()) return status::unimplemented;
+
+    // Guards against a 0-sized dimension
+    if (src_d.has_zero_dim()) return status::unimplemented;
+
+    // No scaling
+    asp_.beta = 1;
+
+    asp_.is_logsoftmax = is_logsoftmax();
+
+    // The strides give us the in memory inner size
+    dim_t inner_size_ = src_d.blocking_desc().strides[axis()];
+
+    dim_t axis_size_ = axis_size();
+
+    // The outer size is any left-over dimensions not inner or on the axis
+    dim_t outer_size_ = src_d.nelems() / (inner_size_ * axis_size_);
+
+    // In this context, NHWC tells ACL that the logical and physical
+    // dimensions are the same
+    arm_compute::DataLayout acl_layout = arm_compute::DataLayout::NHWC;
+
+    const arm_compute::DataType acl_data_t
+            = acl_utils::get_acl_data_t(data_type);
+
+    const int threads = dnnl_get_max_threads();
+
+    // A rough empirical heuristic created by fitting a polynomial
+    // of the tensor sizes and thread count to the run time of the
+    // ref and ACL softmax. This variable is greater than zero when
+    // ref is faster, and less than zero when ACL is faster. We can
+    // interpret the constant term as the constant overhead
+    // associated with calling the external library and the negative
+    // coefficient on total_size as ACL being faster at processing
+    // each element
+    auto calculate_performance_diff = [&](double axis_coeff) {
+        double acl_ref_performance_diff = 1 + 0.005 * outer_size_
+                + axis_coeff * axis_size_
+                        * std::ceil(double(outer_size_) / threads);
+
+        if (threads > 1 || outer_size_ > 1) {
+            acl_ref_performance_diff
+                    += 17; // Adds constant overhead for using threads within ACL
+        }
+        return acl_ref_performance_diff;
+    };
+
+    if (inner_size_ == 1) {
+        double acl_ref_performance_diff = calculate_performance_diff(-0.0027);
+        if (acl_ref_performance_diff > 0) return status::unimplemented;
+
+        // If the inner size is 1, we can get rid of the dimension.
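+        // (Worked example, for illustration: a plain 8x64 f32 src with
+        //  axis() == 1 has stride 1 on the softmax axis, so inner_size_ == 1,
+        //  axis_size_ == 64 and outer_size_ == 8, and ACL is handed a
+        //  TensorShape(64, 8) with the softmax taken over axis 0.)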
+        // This stops ACL doing an unnecessary permute
+        arm_compute::TensorShape acl_tensor_shape
+                = arm_compute::TensorShape(axis_size_, outer_size_);
+        asp_.axis = 0;
+
+        asp_.src_info = arm_compute::TensorInfo(
+                acl_tensor_shape, 1, acl_data_t, acl_layout);
+        asp_.dst_info = arm_compute::TensorInfo(
+                acl_tensor_shape, 1, acl_data_t, acl_layout);
+    } else {
+        // A rough empirical heuristic, see comment above
+        // The only difference here is that ACL does a reorder, and so
+        // is considerably better
+        double acl_ref_performance_diff
+                = calculate_performance_diff(-0.01 * inner_size_);
+        if (acl_ref_performance_diff > 0) return status::unimplemented;
+
+        // Irrespective of the input dimensions, we construct a tensor
+        // with dimensions such that softmax can be applied over the
+        // middle axis (1), with the correct stride and vector length.
+        arm_compute::TensorShape acl_tensor_shape = arm_compute::TensorShape(
+                inner_size_, axis_size_, outer_size_);
+        asp_.axis = 1;
+
+        asp_.src_info = arm_compute::TensorInfo(
+                acl_tensor_shape, 1, acl_data_t, acl_layout);
+        asp_.dst_info = arm_compute::TensorInfo(
+                acl_tensor_shape, 1, acl_data_t, acl_layout);
+    }
+
+    // Validate manually to check for return status
+    ACL_CHECK_VALID(arm_compute::experimental::op::CpuSoftmax::validate(
+            &asp_.src_info, &asp_.dst_info, asp_.beta, asp_.axis));
+
+    return status::success;
+}
+
+status_t acl_softmax_fwd_t::init(engine_t *engine) {
+    auto asp = pd()->asp_;
+
+    auto op = std::make_unique<arm_compute::experimental::op::CpuSoftmax>();
+
+    softmax_op_ = std::move(op);
+    // Configure the softmax operation; memory allocation happens here.
+    softmax_op_->configure(&asp.src_info, &asp.dst_info, asp.beta, asp.axis,
+            asp.is_logsoftmax);
+
+    return status::success;
+}
+
+status_t acl_softmax_fwd_t::execute_forward(const exec_ctx_t &ctx) const {
+    auto src = CTX_IN_MEM(const void *, DNNL_ARG_SRC);
+    auto dst = CTX_OUT_MEM(void *, DNNL_ARG_DST);
+
+    auto asp = pd()->asp_;
+
+    arm_compute::Tensor src_tensor;
+    arm_compute::Tensor dst_tensor;
+
+    src_tensor.allocator()->init(asp.src_info);
+    src_tensor.allocator()->import_memory(const_cast<void *>(src));
+    dst_tensor.allocator()->init(asp.dst_info);
+    dst_tensor.allocator()->import_memory(dst);
+
+    arm_compute::ITensorPack run_pack {
+            {arm_compute::TensorType::ACL_SRC_0, &src_tensor},
+            {arm_compute::TensorType::ACL_DST, &dst_tensor}};
+
+    softmax_op_->run(run_pack);
+
+    return status::success;
+}
+
+} // namespace acl
+} // namespace cpu
+} // namespace impl
+} // namespace dnnl
diff --git a/src/cpu/acl/acl_softmax.hpp b/src/cpu/acl/acl_softmax.hpp
new file mode 100644
index 00000000000..470eea9a1a3
--- /dev/null
+++ b/src/cpu/acl/acl_softmax.hpp
@@ -0,0 +1,71 @@
+/*******************************************************************************
+* Copyright 2021-2024 Arm Ltd. and affiliates
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/ + +#ifndef CPU_ACL_SOFTMAX_HPP +#define CPU_ACL_SOFTMAX_HPP + +#include "cpu/cpu_softmax_pd.hpp" + +#include "cpu/acl/acl_utils.hpp" + +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/runtime/IOperator.h" +#include "arm_compute/runtime/experimental/operators/CpuSoftmax.h" + +namespace dnnl { +namespace impl { +namespace cpu { +namespace acl { + +struct acl_softmax_conf_t { + arm_compute::TensorInfo src_info; + arm_compute::TensorInfo dst_info; + float beta; + int32_t axis; + bool is_logsoftmax; +}; + +struct acl_softmax_fwd_t : public primitive_t { + struct pd_t : public cpu_softmax_fwd_pd_t { + using cpu_softmax_fwd_pd_t::cpu_softmax_fwd_pd_t; + + DECLARE_COMMON_PD_T("acl", acl_softmax_fwd_t); + status_t init(engine_t *engine); + + acl_softmax_conf_t asp_; + }; // pd_t + + // constructor + acl_softmax_fwd_t(const pd_t *apd) : primitive_t(apd) {} + + status_t execute(const exec_ctx_t &ctx) const override { + return execute_forward(ctx); + } + +private: + const pd_t *pd() const; + + status_t init(engine_t *engine) override; + status_t execute_forward(const exec_ctx_t &ctx) const; + std::unique_ptr softmax_op_; +}; // acl_softmax_fwd_t + +} // namespace acl +} // namespace cpu +} // namespace impl +} // namespace dnnl + +#endif diff --git a/src/cpu/acl/acl_thread.cpp b/src/cpu/acl/acl_thread.cpp new file mode 100644 index 00000000000..5ab2e428605 --- /dev/null +++ b/src/cpu/acl/acl_thread.cpp @@ -0,0 +1,125 @@ +/******************************************************************************* +* Copyright 2022-2025 Arm Ltd. and affiliates +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#include "cpu/acl/acl_thread.hpp" +#if DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_THREADPOOL +#include "cpu/acl/acl_threadpool_scheduler.hpp" +#endif +#include "cpu/acl/acl_benchmark_scheduler.hpp" + +namespace dnnl { +namespace impl { +namespace cpu { +namespace acl { + +namespace acl_thread_utils { + +#if DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_OMP +void acl_thread_bind() { + static std::once_flag flag_once; + // The threads in Compute Library are bound for the cores 0..max_threads-1 + // dnnl_get_max_threads() returns OMP_NUM_THREADS + const int max_threads = dnnl_get_max_threads(); + // arm_compute::Scheduler does not support concurrent access thus a + // workaround here restricts it to only one call + std::call_once(flag_once, [&]() { + arm_compute::Scheduler::get().set_num_threads(max_threads); + }); +} +// Swap BenchmarkScheduler for default ACL scheduler builds (i.e. 
CPPScheduler, OMPScheduler)
+void acl_set_benchmark_scheduler_default() {
+    static std::once_flag flag_once;
+    arm_compute::IScheduler *_real_scheduler = &arm_compute::Scheduler::get();
+    std::shared_ptr<arm_compute::IScheduler> benchmark_scheduler
+            = std::make_unique<BenchmarkScheduler>(*_real_scheduler);
+    // Set the benchmark scheduler in ACL
+    std::call_once(flag_once, [&]() {
+        arm_compute::Scheduler::set(
+                std::static_pointer_cast<arm_compute::IScheduler>(
+                        benchmark_scheduler));
+    });
+}
+#endif
+
+#if DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_THREADPOOL
+
+void acl_set_tp_scheduler() {
+    static thread_local std::once_flag flag_once;
+    // Create threadpool scheduler
+    std::call_once(flag_once, [&]() {
+        // Create threadpool scheduler
+        std::shared_ptr<arm_compute::IScheduler> threadpool_scheduler
+                = std::make_unique<ThreadpoolScheduler>();
+        arm_compute::Scheduler::set(threadpool_scheduler);
+    });
+}
+
+void acl_set_threadpool_num_threads() {
+    using namespace dnnl::impl::threadpool_utils;
+    static thread_local std::once_flag flag_once;
+    threadpool_interop::threadpool_iface *tp = get_active_threadpool();
+    // Check active threadpool
+    bool is_main = get_active_threadpool() == tp;
+    if (is_main) {
+        // Set num threads based on threadpool size
+        const int num_threads = (tp) ? dnnl_get_max_threads() : 1;
+        std::call_once(flag_once, [&]() {
+            arm_compute::Scheduler::get().set_num_threads(num_threads);
+        });
+    }
+}
+// Swap BenchmarkScheduler for custom scheduler builds (i.e. ThreadPoolScheduler)
+void acl_set_tp_benchmark_scheduler() {
+    static thread_local std::once_flag flag_once;
+    std::call_once(flag_once, [&]() {
+        // Create threadpool scheduler
+        std::unique_ptr<ThreadpoolScheduler> threadpool_scheduler
+                = std::make_unique<ThreadpoolScheduler>();
+        arm_compute::IScheduler *_real_scheduler = nullptr;
+        _real_scheduler = threadpool_scheduler.release();
+
+        // Create benchmark scheduler and set TP as real scheduler
+        std::shared_ptr<arm_compute::IScheduler> benchmark_scheduler
+                = std::make_unique<BenchmarkScheduler>(*_real_scheduler);
+
+        arm_compute::Scheduler::set(benchmark_scheduler);
+    });
+}
+#endif
+
+void set_acl_threading() {
+#if DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_OMP
+    acl_thread_bind();
+    if (get_verbose(verbose_t::profile_externals)) {
+        acl_set_benchmark_scheduler_default();
+    }
+#endif
+#if DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_THREADPOOL
+    if (get_verbose(verbose_t::profile_externals)) {
+        acl_set_tp_benchmark_scheduler();
+    } else {
+        acl_set_tp_scheduler();
+    }
+
+#endif
+}
+
+} // namespace acl_thread_utils
+
+} // namespace acl
+} // namespace cpu
+} // namespace impl
+} // namespace dnnl
diff --git a/src/cpu/aarch64/acl_thread.hpp b/src/cpu/acl/acl_thread.hpp
similarity index 92%
rename from src/cpu/aarch64/acl_thread.hpp
rename to src/cpu/acl/acl_thread.hpp
index f073376e63a..26b65564d79 100644
--- a/src/cpu/aarch64/acl_thread.hpp
+++ b/src/cpu/acl/acl_thread.hpp
@@ -14,8 +14,8 @@
 * limitations under the License.
*******************************************************************************/ -#ifndef CPU_AARCH64_ACL_THREAD_HPP -#define CPU_AARCH64_ACL_THREAD_HPP +#ifndef CPU_ACL_THREAD_HPP +#define CPU_ACL_THREAD_HPP #include "common/dnnl_thread.hpp" #include "common/verbose.hpp" @@ -25,7 +25,7 @@ namespace dnnl { namespace impl { namespace cpu { -namespace aarch64 { +namespace acl { namespace acl_thread_utils { @@ -49,9 +49,9 @@ void acl_set_tp_benchmark_scheduler(); void set_acl_threading(); } // namespace acl_thread_utils -} // namespace aarch64 +} // namespace acl } // namespace cpu } // namespace impl } // namespace dnnl -#endif // CPU_AARCH64_ACL_THREAD_HPP +#endif // CPU_ACL_THREAD_HPP diff --git a/src/cpu/aarch64/acl_threadpool_scheduler.cpp b/src/cpu/acl/acl_threadpool_scheduler.cpp similarity index 84% rename from src/cpu/aarch64/acl_threadpool_scheduler.cpp rename to src/cpu/acl/acl_threadpool_scheduler.cpp index 30910398d9c..ae559c5ead9 100644 --- a/src/cpu/aarch64/acl_threadpool_scheduler.cpp +++ b/src/cpu/acl/acl_threadpool_scheduler.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2022-2024 Arm Ltd. and affiliates +* Copyright 2022-2025 Arm Ltd. and affiliates * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,33 +14,26 @@ * limitations under the License. *******************************************************************************/ -#include "cpu/aarch64/acl_threadpool_scheduler.hpp" +#include "cpu/acl/acl_threadpool_scheduler.hpp" #if DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_THREADPOOL -#include "cpu/aarch64/acl_thread.hpp" - #include "common/counting_barrier.hpp" #include "common/dnnl_thread.hpp" +#include "cpu/acl/acl_thread.hpp" #include "arm_compute/core/CPP/ICPPKernel.h" #include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Utils.h" #include "arm_compute/runtime/IScheduler.h" -// BARRIER #include #include -#include #include -#include -#include namespace dnnl { namespace impl { namespace cpu { -namespace aarch64 { +namespace acl { using namespace arm_compute; @@ -51,7 +44,7 @@ class ThreadFeeder { /// Function to check the next element in the range if there is one. bool get_next(unsigned int &next) { - next = atomic_fetch_add_explicit( + next = std::atomic_fetch_add_explicit( &_atomic_counter, 1u, std::memory_order_relaxed); return next < _end; } @@ -70,11 +63,8 @@ void process_workloads(std::vector &workloads, } while (feeder.get_next(workload_index)); } -ThreadpoolScheduler::ThreadpoolScheduler() { - using namespace dnnl::impl::threadpool_utils; - // Set number of threads to one when threadpool is not available. - _num_threads = get_active_threadpool() == nullptr ? 1 : num_threads_hint(); -} +ThreadpoolScheduler::ThreadpoolScheduler() + : _num_threads(dnnl_get_max_threads()) {} ThreadpoolScheduler::~ThreadpoolScheduler() = default; @@ -83,8 +73,8 @@ unsigned int ThreadpoolScheduler::num_threads() const { } void ThreadpoolScheduler::set_num_threads(unsigned int num_threads) { - arm_compute::lock_guard lock(this->_run_workloads_mutex); - _num_threads = num_threads == 0 ? num_threads_hint() : num_threads; + std::lock_guard lock(this->_mtx); + _num_threads = num_threads == 0 ? 
dnnl_get_max_threads() : num_threads; } void ThreadpoolScheduler::schedule(ICPPKernel *kernel, const Hints &hints) { @@ -104,7 +94,7 @@ void ThreadpoolScheduler::schedule_op(ICPPKernel *kernel, const Hints &hints, void ThreadpoolScheduler::run_workloads( std::vector &workloads) { - arm_compute::lock_guard lock(this->_run_workloads_mutex); + std::lock_guard lock(this->_mtx); const unsigned int num_threads = std::min(static_cast(_num_threads), @@ -145,7 +135,7 @@ void ThreadpoolScheduler::run_workloads( if (is_async) b.wait(); } -} // namespace aarch64 +} // namespace acl } // namespace cpu } // namespace impl } // namespace dnnl diff --git a/src/cpu/aarch64/acl_threadpool_scheduler.hpp b/src/cpu/acl/acl_threadpool_scheduler.hpp similarity index 84% rename from src/cpu/aarch64/acl_threadpool_scheduler.hpp rename to src/cpu/acl/acl_threadpool_scheduler.hpp index e9ba21c8032..6370141010e 100644 --- a/src/cpu/aarch64/acl_threadpool_scheduler.hpp +++ b/src/cpu/acl/acl_threadpool_scheduler.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2022 Arm Ltd. and affiliates +* Copyright 2022, 2025 Arm Ltd. and affiliates * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,25 +14,26 @@ * limitations under the License. *******************************************************************************/ -#ifndef CPU_AARCH64_ACL_THREADPOOL_SCHEDULER_HPP -#define CPU_AARCH64_ACL_THREADPOOL_SCHEDULER_HPP +#ifndef CPU_ACL_THREADPOOL_SCHEDULER_HPP +#define CPU_ACL_THREADPOOL_SCHEDULER_HPP #include "oneapi/dnnl/dnnl_config.h" #if DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_THREADPOOL #include "arm_compute/runtime/IScheduler.h" -#include "support/Mutex.h" + +#include namespace dnnl { namespace impl { namespace cpu { -namespace aarch64 { +namespace acl { class ThreadpoolScheduler final : public arm_compute::IScheduler { public: ThreadpoolScheduler(); - ~ThreadpoolScheduler(); + ~ThreadpoolScheduler() override; /// Sets the number of threads the scheduler will use to run the kernels. void set_num_threads(unsigned int num_threads) override; @@ -54,15 +55,15 @@ class ThreadpoolScheduler final : public arm_compute::IScheduler { void run_workloads(std::vector &workloads) override; private: - uint _num_threads {}; - arm_compute::Mutex _run_workloads_mutex {}; + unsigned int _num_threads {}; + std::mutex _mtx; }; -} // namespace aarch64 +} // namespace acl } // namespace cpu } // namespace impl } // namespace dnnl -#endif // CPU_AARCH64_ACL_THREADPOOL_SCHEDULER_HPP +#endif // CPU_ACL_THREADPOOL_SCHEDULER_HPP #endif // DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_THREADPOOL diff --git a/src/cpu/aarch64/acl_utils.cpp b/src/cpu/acl/acl_utils.cpp similarity index 99% rename from src/cpu/aarch64/acl_utils.cpp rename to src/cpu/acl/acl_utils.cpp index eaf415df01f..a5d7b8a6048 100644 --- a/src/cpu/aarch64/acl_utils.cpp +++ b/src/cpu/acl/acl_utils.cpp @@ -14,12 +14,12 @@ * limitations under the License. 
*******************************************************************************/
-#include "cpu/aarch64/acl_utils.hpp"
+#include "cpu/acl/acl_utils.hpp"
 namespace dnnl {
 namespace impl {
 namespace cpu {
-namespace aarch64 {
+namespace acl {
 namespace acl_utils {
@@ -345,7 +345,7 @@ void reorder_to_weight_format(arm_compute::TensorInfo &info, memory_desc_t &md,
 } // namespace acl_utils
-} // namespace aarch64
+} // namespace acl
 } // namespace cpu
 } // namespace impl
 } // namespace dnnl
diff --git a/src/cpu/aarch64/acl_utils.hpp b/src/cpu/acl/acl_utils.hpp
similarity index 97%
rename from src/cpu/aarch64/acl_utils.hpp
rename to src/cpu/acl/acl_utils.hpp
index f76a78b9ff1..939d0001f4d 100644
--- a/src/cpu/aarch64/acl_utils.hpp
+++ b/src/cpu/acl/acl_utils.hpp
@@ -14,8 +14,8 @@
 * limitations under the License.
 *******************************************************************************/
-#ifndef CPU_AARCH64_ACL_UTILS_HPP
-#define CPU_AARCH64_ACL_UTILS_HPP
+#ifndef CPU_ACL_UTILS_HPP
+#define CPU_ACL_UTILS_HPP
 #include
@@ -33,7 +33,7 @@ namespace dnnl {
 namespace impl {
 namespace cpu {
-namespace aarch64 {
+namespace acl {
 namespace acl_utils {
@@ -124,9 +124,9 @@ void reorder_to_weight_format(arm_compute::TensorInfo &info, memory_desc_t &md,
 } // namespace acl_utils
-} // namespace aarch64
+} // namespace acl
 } // namespace cpu
 } // namespace impl
 } // namespace dnnl
-#endif // CPU_AARCH64_ACL_UTILS_HPP
+#endif // CPU_ACL_UTILS_HPP
diff --git a/src/cpu/acl/acl_winograd_convolution.cpp b/src/cpu/acl/acl_winograd_convolution.cpp
new file mode 100644
index 00000000000..eb2e0bd9883
--- /dev/null
+++ b/src/cpu/acl/acl_winograd_convolution.cpp
@@ -0,0 +1,44 @@
+/*******************************************************************************
+* Copyright 2020-2023, 2025 Arm Ltd. and affiliates
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#include "cpu/acl/acl_winograd_convolution.hpp"
+
+namespace dnnl {
+namespace impl {
+namespace cpu {
+namespace acl {
+using data_t = prec_traits_t<data_type::f32>::type;
+
+status_t acl_wino_convolution_fwd_t::execute_forward(
+        const exec_ctx_t &ctx) const {
+    // Lock here is needed because resource_mapper does not support
+    // concurrent multithreaded access.
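+    // A scoped std::lock_guard suffices here: it holds mtx for the full
+    // body of execute_forward() and releases it on return, serialising
+    // concurrent executions of this primitive (mtx is declared 'mutable'
+    // in the header so it can be locked from this const member function).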
+    std::lock_guard<std::mutex> _lock {this->mtx};
+    // Retrieve primitive resource and configured Compute Library objects
+    auto *acl_resource
+            = ctx.get_resource_mapper()->get<acl_wino_resource_t>(this);
+    acl_obj_t<arm_compute::NEWinogradConvolutionLayer> &acl_wino_obj
+            = acl_resource->get_acl_obj();
+
+    return execute_forward_conv_acl<
+            acl_obj_t<arm_compute::NEWinogradConvolutionLayer>, pd_t, data_t>(
+            ctx, acl_wino_obj, pd());
+}
+
+} // namespace acl
+} // namespace cpu
+} // namespace impl
+} // namespace dnnl
diff --git a/src/cpu/acl/acl_winograd_convolution.hpp b/src/cpu/acl/acl_winograd_convolution.hpp
new file mode 100644
index 00000000000..9c29ea376a3
--- /dev/null
+++ b/src/cpu/acl/acl_winograd_convolution.hpp
@@ -0,0 +1,152 @@
+/*******************************************************************************
+* Copyright 2020-2025 Arm Ltd. and affiliates
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#ifndef CPU_ACL_WINOGRAD_CONVOLUTION_HPP
+#define CPU_ACL_WINOGRAD_CONVOLUTION_HPP
+
+#include "cpu/cpu_convolution_pd.hpp"
+
+#include "cpu/acl/acl_convolution_utils.hpp"
+
+namespace dnnl {
+namespace impl {
+namespace cpu {
+namespace acl {
+
+struct acl_wino_resource_t : public resource_t {
+    acl_wino_resource_t()
+        : acl_wino_obj_(utils::make_unique<
+                acl_obj_t<arm_compute::NEWinogradConvolutionLayer>>()) {}
+
+    status_t configure(const acl_conv_conf_t &acp) {
+        if (!acl_wino_obj_) return status::out_of_memory;
+
+        // Init Compute Library tensors based on info from descriptor
+        acl_wino_obj_->src_tensor.allocator()->init(acp.src_tensor_info);
+        acl_wino_obj_->wei_tensor.allocator()->init(acp.wei_tensor_info);
+        acl_wino_obj_->dst_tensor.allocator()->init(acp.dst_tensor_info);
+        acl_wino_obj_->bia_tensor.allocator()->init(acp.bia_tensor_info);
+
+        // clang-format off
+        acl_wino_obj_->conv.configure(
+            &acl_wino_obj_->src_tensor,
+            &acl_wino_obj_->wei_tensor,
+            acp.with_bias ?
&acl_wino_obj_->bia_tensor : nullptr,
+            &acl_wino_obj_->dst_tensor,
+            acp.padstride_info,
+            acp.act_info,
+            true); // to support 5x5, 7x7 filter shapes in addition to 3x3
+        // clang-format on
+
+        return status::success;
+    }
+
+    acl_obj_t<arm_compute::NEWinogradConvolutionLayer> &get_acl_obj() const {
+        return *acl_wino_obj_;
+    }
+
+    DNNL_DISALLOW_COPY_AND_ASSIGN(acl_wino_resource_t);
+
+private:
+    std::unique_ptr<acl_obj_t<arm_compute::NEWinogradConvolutionLayer>>
+            acl_wino_obj_;
+}; // acl_wino_resource_t
+
+struct acl_wino_convolution_fwd_t : public primitive_t {
+    struct pd_t : public cpu_convolution_fwd_pd_t {
+        using cpu_convolution_fwd_pd_t::cpu_convolution_fwd_pd_t;
+
+        DECLARE_COMMON_PD_T(
+                "wino:acl", acl_wino_convolution_fwd_t, USE_GLOBAL_SCRATCHPAD);
+
+        status_t init(engine_t *engine) {
+            using namespace data_type;
+            const bool is_fp16_ok = expect_data_types(f16, f16, f16, f16, undef)
+                    && attr()->has_default_values(
+                            primitive_attr_t::skip_mask_t::post_ops, f16);
+            const bool is_fp32_ok = expect_data_types(f32, f32, f32, f32, undef)
+                    && attr()->has_default_values(
+                            primitive_attr_t::skip_mask_t::post_ops, f32);
+            bool ok = is_fwd()
+                    && utils::one_of(desc()->alg_kind,
+                            alg_kind::convolution_auto,
+                            alg_kind::convolution_winograd)
+                    && utils::one_of(true, is_fp16_ok, is_fp32_ok)
+                    && !has_zero_dim_memory();
+
+            ok = ok && DNNL_CPU_THREADING_RUNTIME != DNNL_RUNTIME_THREADPOOL;
+            if (!ok) return status::unimplemented;
+
+            CHECK(acl_convolution_utils::init_conf_wino(acp_, src_md_,
+                    weights_md_, dst_md_, bias_md_, *desc(), *attr()));
+
+            set_default_alg_kind(alg_kind::convolution_winograd);
+
+            CHECK(post_ops.init(
+                    engine, attr_.post_ops_, dst_md_, acp_.act_info));
+            acp_.use_dst_acc_for_sum = post_ops.has_sum();
+
+            if (acp_.use_dst_acc_for_sum) {
+                const memory_desc_wrapper dst_d(&dst_md_);
+                auto scratchpad = scratchpad_registry().registrar();
+                scratchpad.book(memory_tracking::names::key_generic_acc,
+                        dst_d.nelems(), dst_d.data_type_size());
+            }
+
+            return status::success;
+        }
+
+        acl_conv_conf_t acp_ = utils::zero<acl_conv_conf_t>();
+        acl_post_ops_t post_ops;
+    };
+
+    acl_wino_convolution_fwd_t(const pd_t *apd) : primitive_t(apd) {}
+
+    status_t create_resource(
+            engine_t *engine, resource_mapper_t &mapper) const override {
+        if (mapper.has_resource(this)) return status::success;
+
+        auto r = utils::make_unique<acl_wino_resource_t>();
+        if (!r) return status::out_of_memory;
+
+        // Configure the resource based on information from primitive descriptor
+        CHECK(r->configure(pd()->acp_));
+        mapper.add(this, std::move(r));
+
+        return status::success;
+    }
+
+    ~acl_wino_convolution_fwd_t() override = default;
+
+    using data_t = typename prec_traits_t<data_type::f32>::type;
+
+    status_t execute(const exec_ctx_t &ctx) const override {
+        return execute_forward(ctx);
+    }
+
+private:
+    // To guard the const execute_forward(), the mutex must be 'mutable'
+    mutable std::mutex mtx;
+    status_t execute_forward(const exec_ctx_t &ctx) const;
+    const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); }
+}; // acl_wino_convolution_fwd_t
+
+} // namespace acl
+} // namespace cpu
+} // namespace impl
+} // namespace dnnl
+
+#endif // CPU_ACL_WINOGRAD_CONVOLUTION_HPP
diff --git a/src/cpu/aarch64/matmul/acl_lowp_matmul.cpp b/src/cpu/acl/matmul/acl_lowp_matmul.cpp
similarity index 75%
rename from src/cpu/aarch64/matmul/acl_lowp_matmul.cpp
rename to src/cpu/acl/matmul/acl_lowp_matmul.cpp
index 076d5fd321a..925431cea0c 100644
--- a/src/cpu/aarch64/matmul/acl_lowp_matmul.cpp
+++ b/src/cpu/acl/matmul/acl_lowp_matmul.cpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-*
Copyright 2024 Arm Ltd. and affiliates
+* Copyright 2024-2025 Arm Ltd. and affiliates
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -14,12 +14,14 @@
 * limitations under the License.
 *******************************************************************************/
 
-#include "cpu/aarch64/matmul/acl_lowp_matmul.hpp"
+#include "cpu/acl/matmul/acl_lowp_matmul.hpp"
+
+#include "src/cpu/CpuTypes.h"
 
 namespace dnnl {
 namespace impl {
 namespace cpu {
-namespace aarch64 {
+namespace acl {
 namespace matmul {
 
 status_t acl_lowp_matmul_resource_t::configure(
@@ -61,18 +63,25 @@ status_t acl_lowp_matmul_t::pd_t::init(engine_t *engine) {
     VDISPATCH_MATMUL(set_default_formats(), "failed to set default formats");
     using smask_t = primitive_attr_t::skip_mask_t;
-    VDISPATCH_MATMUL(
-            attr()->has_default_values(smask_t::scales_runtime
-                    | smask_t::zero_points_runtime | smask_t::post_ops),
+    VDISPATCH_MATMUL(attr()->has_default_values(smask_t::scales
+                             | smask_t::zero_points | smask_t::post_ops),
             "only scale, zero point and post-ops attrs supported");
-    VDISPATCH_MATMUL(attr()->scales_.get(DNNL_ARG_SRC).mask_ == 0
-                    && attr()->zero_points_.get(DNNL_ARG_SRC) == 0
-                    && attr()->scales_.get(DNNL_ARG_WEIGHTS).mask_ == 0
-                    && attr()->zero_points_.get(DNNL_ARG_WEIGHTS) == 0
-                    && attr()->scales_.get(DNNL_ARG_DST).mask_ == 0
-                    && attr()->zero_points_.get(DNNL_ARG_DST) == 0,
-            "common scales and zero points only");
+    static const std::vector<int> supported_args {
+            DNNL_ARG_SRC, DNNL_ARG_WEIGHTS, DNNL_ARG_DST};
+    for (int arg : supported_args) {
+        if (attr()->scales_.has_default_values(arg)) continue;
+
+        VDISPATCH_MATMUL(attr()->scales_.get_mask(arg) == 0,
+                VERBOSE_UNSUPPORTED_SCALES_CFG);
+    }
+
+    for (int arg : supported_args) {
+        if (attr()->zero_points_.has_default_values(arg)) continue;
+
+        VDISPATCH_MATMUL(attr()->zero_points_.get_mask(arg) == 0,
+                VERBOSE_UNSUPPORTED_SCALES_CFG);
+    }
     VDISPATCH_MATMUL(
             !has_runtime_dims_or_strides(), VERBOSE_RUNTIMEDIM_UNSUPPORTED);
@@ -82,6 +91,14 @@ status_t acl_lowp_matmul_t::pd_t::init(engine_t *engine) {
     const memory_desc_wrapper bia_d(bias_md_);
     const memory_desc_wrapper dst_d(dst_md_);
 
+    cpu::matmul::matmul_helper_t helper(src_d, wei_d, dst_d);
+    const dim_t M = helper.M();
+    const dim_t N = helper.N();
+    const dim_t K = helper.K();
+    const dim_t dst_batch = helper.batch();
+    const dim_t src_batch = helper.src_batch();
+    const dim_t wei_batch = helper.wei_batch();
+
     using namespace data_type;
 
     // Note that has_default_values checks the argument for default zero
@@ -100,39 +117,66 @@ status_t acl_lowp_matmul_t::pd_t::init(engine_t *engine) {
             VERBOSE_UNSUPPORTED_DT_CFG);
     almc_.dst_is_s8 = dst_d.data_type() == s8;
-    VDISPATCH_MATMUL(src_d.matches_tag(format_tag::ab)
-                    && wei_d.matches_tag(format_tag::ab)
-                    && dst_d.matches_tag(format_tag::ab),
-            VERBOSE_UNSUPPORTED_TAG);
+    // Reject the op when it would run on a CPU that lacks the i8mm
+    // instruction set and the destination data type is s8. This is a
+    // temporary fix until the underlying issue is resolved.
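A minimal standalone sketch of what this dispatch amounts to (illustrative only; it reuses the ACL feature query and the oneDNN names already visible in this hunk):

    // FEAT_I8MM is required by the s8-destination kernels; without it this
    // implementation returns unimplemented so a later entry in the
    // implementation list can pick the problem up.
    const bool cpu_has_i8mm = arm_compute::CPUInfo::get().has_i8mm();
    if (!cpu_has_i8mm && dst_d.data_type() == data_type::s8)
        return status::unimplemented;

The actual dispatch line follows.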
+ VDISPATCH_MATMUL( + arm_compute::CPUInfo::get().has_i8mm() || dst_d.data_type() != s8, + "Op not supported on CPUs without i8mm instructions when dest " + "datatype is s8"); + + using namespace format_tag; + auto src_tag = memory_desc_matches_one_of_tag(src_md_, abcd, abc, ab); + auto wei_tag = memory_desc_matches_one_of_tag(weights_md_, abcd, abc, ab); + auto dst_tag = memory_desc_matches_one_of_tag(dst_md_, abcd, abc, ab); - VDISPATCH_MATMUL_SC( - memory_desc_init_by_tag(bias_md_, bias_md_.ndims, bias_md_.dims, - bias_md_.data_type, format_tag::ab), + ACL_CHECK_SUPPORT( + utils::one_of(format_tag::undef, src_tag, wei_tag, dst_tag), + "Format tag is undefined"); + + VDISPATCH_MATMUL_SC(memory_desc_init_by_tag(bias_md_, bias_md_.ndims, + bias_md_.dims, bias_md_.data_type, dst_tag), VERBOSE_UNSUPPORTED_BIAS_CFG); // We set the QuantizationInfo to be dynamic because it is re-set in run() - almc_.src_tensor_info - = arm_compute::TensorInfo(arm_compute::TensorShape(K(), M()), 1, - arm_compute::DataType::QASYMM8_SIGNED, - arm_compute::QuantizationInfo(1.0, 0, true)); + almc_.src_tensor_info = arm_compute::TensorInfo( + arm_compute::TensorShape(K, M, 1, src_batch), 1, + arm_compute::DataType::QASYMM8_SIGNED, + arm_compute::QuantizationInfo(1.0, 0, true)); almc_.src_tensor_info.set_are_values_constant(false); almc_.wei_tensor_info - = arm_compute::TensorInfo(arm_compute::TensorShape(N(), K()), 1, - arm_compute::DataType::QASYMM8_SIGNED, + = arm_compute::TensorInfo(arm_compute::TensorShape(N, K, wei_batch), + 1, arm_compute::DataType::QASYMM8_SIGNED, arm_compute::QuantizationInfo(1.0, 0, true)); almc_.wei_tensor_info.set_are_values_constant(false); almc_.bia_tensor_info = arm_compute::TensorInfo( arm_compute::TensorShape(), 1, arm_compute::DataType::F32); almc_.with_bias = bia_d.format_kind() != format_kind::undef; + if (almc_.with_bias) { - // This is not currently guarded in ACL - VDISPATCH_MATMUL(bia_d.ndims() == 2 && bia_d.dims()[0] == 1 - && bia_d.dims()[1] == N(), - "Only 1xN bias is supported"); - almc_.bia_tensor_info.set_tensor_shape( - arm_compute::TensorShape(bia_d.dims()[1], bia_d.dims()[0])); + switch (bia_d.ndims()) { + case 2: + VDISPATCH_MATMUL(bia_d.dims()[0] == 1 && bia_d.dims()[1] == N, + "Only 1xN bias is supported for 2D input"); + almc_.bia_tensor_info.set_tensor_shape( + arm_compute::TensorShape(bia_d.dims()[1], 1)); + break; + case 3: + VDISPATCH_MATMUL(bia_d.dims()[0] == 1 && bia_d.dims()[1] == 1 + && bia_d.dims()[2] == N, + "Only 1x1xN bias is supported for 3D input"); + almc_.bia_tensor_info.set_tensor_shape( + arm_compute::TensorShape(bia_d.dims()[2], 1, 1)); + break; + case 4: + VDISPATCH_MATMUL(bia_d.dims()[0] == 1 && bia_d.dims()[1] == 1 + && bia_d.dims()[2] == 1 && bia_d.dims()[3] == N, + "Only 1x1x1xN bias is supported for 4D input"); + almc_.bia_tensor_info.set_tensor_shape( + arm_compute::TensorShape(bia_d.dims()[3], 1, 1, 1)); + break; + } } // We can fuse sum if it is the first post op @@ -166,14 +210,15 @@ status_t acl_lowp_matmul_t::pd_t::init(engine_t *engine) { almc_.gemm_info.accumulate() ? 
1 : 0)); almc_.dst_tensor_info = arm_compute::TensorInfo( - arm_compute::TensorShape(N(), M()), arm_compute::Format::F32); + arm_compute::TensorShape(N, M, 1, dst_batch), + arm_compute::Format::F32); almc_.dst_cast_tensor_info = almc_.dst_tensor_info; - almc_.dst_s8_tensor_info - = arm_compute::TensorInfo(arm_compute::TensorShape(N(), M()), 1, - arm_compute::DataType::QASYMM8_SIGNED, - arm_compute::QuantizationInfo(1.0, 0, true)); + almc_.dst_s8_tensor_info = arm_compute::TensorInfo( + arm_compute::TensorShape(N, M, 1, dst_batch), 1, + arm_compute::DataType::QASYMM8_SIGNED, + arm_compute::QuantizationInfo(1.0, 0, true)); ACL_CHECK_VALID(arm_compute::NEGEMMLowpMatrixMultiplyCore::validate( &almc_.src_tensor_info, &almc_.wei_tensor_info, @@ -203,11 +248,11 @@ status_t acl_lowp_matmul_t::pd_t::init_scratchpad( const memory_desc_wrapper dst_d(&dst_md_); if (almc_.use_dst_acc) { scratchpad.book(memory_tracking::names::key_matmul_dst_in_acc_dt, - dst_d.nelems(), sizeof(float32_t)); + dst_d.nelems(), sizeof(arm_compute::float32_t)); } if (almc_.use_cast_acc) { scratchpad.book(memory_tracking::names::key_matmul_dst_cast_acc, - dst_d.nelems(), sizeof(float32_t)); + dst_d.nelems(), sizeof(arm_compute::float32_t)); } return status::success; } @@ -326,7 +371,7 @@ status_t acl_lowp_matmul_t::execute(const exec_ctx_t &ctx) const { }; } // namespace matmul -} // namespace aarch64 +} // namespace acl } // namespace cpu } // namespace impl } // namespace dnnl diff --git a/src/cpu/aarch64/matmul/acl_lowp_matmul.hpp b/src/cpu/acl/matmul/acl_lowp_matmul.hpp similarity index 89% rename from src/cpu/aarch64/matmul/acl_lowp_matmul.hpp rename to src/cpu/acl/matmul/acl_lowp_matmul.hpp index 30502aea1cc..3aaee9a70df 100644 --- a/src/cpu/aarch64/matmul/acl_lowp_matmul.hpp +++ b/src/cpu/acl/matmul/acl_lowp_matmul.hpp @@ -21,16 +21,18 @@ #include "cpu/matmul/cpu_matmul_pd.hpp" #include "cpu/matmul/matmul_utils.hpp" +#include "arm_compute/core/CPP/CPPTypes.h" #include "arm_compute/runtime/NEON/functions/NEDequantizationLayer.h" #include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h" #include "arm_compute/runtime/NEON/functions/NEQuantizationLayer.h" -#include "cpu/aarch64/acl_post_ops.hpp" -#include "cpu/aarch64/acl_utils.hpp" + +#include "cpu/acl/acl_post_ops.hpp" +#include "cpu/acl/acl_utils.hpp" namespace dnnl { namespace impl { namespace cpu { -namespace aarch64 { +namespace acl { namespace matmul { struct acl_lowp_matmul_obj_t { @@ -76,11 +78,6 @@ struct acl_lowp_matmul_resource_t : public resource_t { struct acl_lowp_matmul_t : public primitive_t { struct pd_t : public dnnl::impl::cpu::matmul::cpu_matmul_pd_t { - - pd_t(const matmul_desc_t *adesc, const primitive_attr_t *attr, - const cpu_matmul_pd_t *hint_fwd_pd) - : cpu_matmul_pd_t(adesc, attr, hint_fwd_pd), almc_() {} - using cpu_matmul_pd_t::cpu_matmul_pd_t; DECLARE_COMMON_PD_T( @@ -90,7 +87,7 @@ struct acl_lowp_matmul_t : public primitive_t { status_t init_scratchpad(memory_tracking::registrar_t &scratchpad); - acl_lowp_matmul_conf_t almc_; + acl_lowp_matmul_conf_t almc_ = utils::zero(); acl_post_ops_t acl_post_ops; }; @@ -106,9 +103,9 @@ struct acl_lowp_matmul_t : public primitive_t { }; } // namespace matmul -} // namespace aarch64 +} // namespace acl } // namespace cpu } // namespace impl } // namespace dnnl -#endif // CPU_AARCH64_ACL_LOWP_MATMUL_HPP \ No newline at end of file +#endif // ACL_LOWP_MATMUL_HPP diff --git a/src/cpu/acl/matmul/acl_lowp_matmul_sq.cpp b/src/cpu/acl/matmul/acl_lowp_matmul_sq.cpp new file mode 100644 index 
00000000000..e4ac7dbb485 --- /dev/null +++ b/src/cpu/acl/matmul/acl_lowp_matmul_sq.cpp @@ -0,0 +1,273 @@ +/******************************************************************************* +* Copyright 2025 Arm Ltd. and affiliates +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#include "cpu/acl/matmul/acl_lowp_matmul_sq.hpp" + +#include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h" +#include "arm_compute/runtime/NEON/functions/NEQuantizationLayer.h" + +#include "cpu/acl/acl_utils.hpp" +#include "src/cpu/CpuTypes.h" + +namespace dnnl { +namespace impl { +namespace cpu { +namespace acl { +namespace matmul { +status_t acl_lowp_matmul_sq_resource_t::configure( + const acl_lowp_matmul_sq_conf_t &almc) { + if (!acl_obj_) return status::out_of_memory; + acl_obj_->src_tensor.allocator()->init(almc.src_tensor_info); + acl_obj_->wei_tensor.allocator()->init(almc.wei_tensor_info); + if (almc.with_bias) { + acl_obj_->bia_tensor.allocator()->init(almc.bia_tensor_info); + } + acl_obj_->dst_tensor.allocator()->init(almc.dst_tensor_info); + arm_compute::QuantizationInfo qi {1.0, 0, true}; + acl_obj_->src_tensor.info()->set_quantization_info(qi); + acl_obj_->wei_tensor.info()->set_quantization_info(qi); + acl_obj_->dst_tensor.info()->set_quantization_info(qi); + acl_obj_->gemm.configure(&acl_obj_->src_tensor, &acl_obj_->wei_tensor, + almc.with_bias ? 
&acl_obj_->bia_tensor : nullptr,
+            &acl_obj_->dst_tensor, almc.gemm_info);
+    return status::success;
+}
+status_t acl_lowp_matmul_sq_t::pd_t::init(engine_t *engine) {
+    VDISPATCH_MATMUL(set_default_formats(), "failed to set default formats");
+    using smask_t = primitive_attr_t::skip_mask_t;
+    VDISPATCH_MATMUL(attr()->has_default_values(smask_t::scales
+                             | smask_t::zero_points | smask_t::post_ops),
+            "only scale, zero point and post-ops attrs supported");
+
+    static const std::vector<int> supported_args {
+            DNNL_ARG_SRC, DNNL_ARG_WEIGHTS, DNNL_ARG_DST};
+    for (int arg : supported_args) {
+        if (attr()->scales_.has_default_values(arg)) continue;
+
+        VDISPATCH_MATMUL(attr()->scales_.get_mask(arg) == 0,
+                VERBOSE_UNSUPPORTED_SCALES_CFG);
+    }
+
+    for (int arg : supported_args) {
+        if (attr()->zero_points_.has_default_values(arg)) continue;
+
+        VDISPATCH_MATMUL(attr()->zero_points_.get_mask(arg) == 0,
+                VERBOSE_UNSUPPORTED_SCALES_CFG);
+    }
+
+    VDISPATCH_MATMUL(
+            !has_runtime_dims_or_strides(), VERBOSE_RUNTIMEDIM_UNSUPPORTED);
+    const memory_desc_wrapper src_d(src_md_);
+    const memory_desc_wrapper wei_d(weights_md_);
+    const memory_desc_wrapper bia_d(bias_md_);
+    const memory_desc_wrapper dst_d(dst_md_);
+
+    cpu::matmul::matmul_helper_t helper(src_d, wei_d, dst_d);
+    const dim_t M = helper.M();
+    const dim_t N = helper.N();
+    const dim_t K = helper.K();
+    const dim_t dst_batch = helper.batch();
+    const dim_t src_batch = helper.src_batch();
+    const dim_t wei_batch = helper.wei_batch();
+
+    using namespace data_type;
+    VDISPATCH_MATMUL(utils::one_of(src_d.data_type(), s8, u8)
+                    && wei_d.data_type() == s8
+                    && (src_d.data_type() == s8 ? dst_d.data_type() == s8
+                                                : dst_d.data_type() == u8),
+            VERBOSE_UNSUPPORTED_DT_CFG);
+    VDISPATCH_MATMUL(utils::one_of(bia_d.data_type(), f32, undef),
+            VERBOSE_UNSUPPORTED_DT_CFG);
+
+    // Reject the op when it would run on a CPU that lacks the i8mm
+    // instruction set. This is a temporary fix until the underlying issue
+    // is resolved.
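Unlike the guard in acl_lowp_matmul.cpp, the check below is unconditional: without FEAT_I8MM this implementation is skipped entirely. A hedged sketch of how a caller could confirm which implementation the dispatcher settled on, using the public oneDNN C++ API (the descriptor names here are placeholders):

    dnnl::matmul::primitive_desc pd(eng, src_md, wei_md, dst_md, attr);
    // Prints e.g. "lowp_gemm_sq:acl" when this file's kernel is selected.
    std::cout << pd.impl_info_str() << std::endl;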
+ VDISPATCH_MATMUL(arm_compute::CPUInfo::get().has_i8mm(), + "Op not supported on CPUs without i8mm instructions"); + + // ACL batch dimension only support s32 for 3D and 4D + VDISPATCH_MATMUL( + wei_batch == 1, "Batch dimension must be 1 for the weights"); + + using namespace format_tag; + auto src_tag = memory_desc_matches_one_of_tag(src_md_, abcd, abc, ab); + auto wei_tag = memory_desc_matches_one_of_tag(weights_md_, abcd, abc, ab); + auto dst_tag = memory_desc_matches_one_of_tag(dst_md_, abcd, abc, ab); + + ACL_CHECK_SUPPORT( + utils::one_of(format_tag::undef, src_tag, wei_tag, dst_tag), + "Format tag is undefined"); + + VDISPATCH_MATMUL_SC(memory_desc_init_by_tag(bias_md_, bias_md_.ndims, + bias_md_.dims, bias_md_.data_type, dst_tag), + VERBOSE_UNSUPPORTED_BIAS_CFG); + + almc_.bia_tensor_info = arm_compute::TensorInfo( + arm_compute::TensorShape(), 1, arm_compute::DataType::S32); + almc_.with_bias = bia_d.format_kind() != format_kind::undef; + + almc_.src_tensor_info = arm_compute::TensorInfo( + arm_compute::TensorShape(K, M, 1, src_batch), 1, + acl_utils::get_acl_data_t(src_d.data_type(), true), + arm_compute::QuantizationInfo(1.0, 0, true)); + almc_.src_tensor_info.set_are_values_constant(false); + + almc_.wei_tensor_info = arm_compute::TensorInfo( + arm_compute::TensorShape(N, K, 1, wei_batch), 1, + acl_utils::get_acl_data_t(wei_d.data_type(), true), + arm_compute::QuantizationInfo(1.0, 0, true)); + almc_.wei_tensor_info.set_are_values_constant(false); + almc_.dst_tensor_info = arm_compute::TensorInfo( + arm_compute::TensorShape(N, M, 1, dst_batch), 1, + acl_utils::get_acl_data_t(dst_d.data_type(), true), + arm_compute::QuantizationInfo(1.0, 0, true)); + + almc_.bia_tensor_info = arm_compute::TensorInfo( + arm_compute::TensorShape(), 1, arm_compute::DataType::S32); + almc_.with_bias = bia_d.format_kind() != format_kind::undef; + + if (almc_.with_bias) { + switch (bia_d.ndims()) { + case 2: + VDISPATCH_MATMUL(bia_d.dims()[0] == 1 && bia_d.dims()[1] == N, + "Only 1xN bias is supported for 2D input"); + almc_.bia_tensor_info.set_tensor_shape(arm_compute::TensorShape( + bia_d.dims()[1], bia_d.dims()[0])); + break; + case 3: + VDISPATCH_MATMUL(bia_d.dims()[0] == 1 && bia_d.dims()[1] == 1 + && bia_d.dims()[2] == N, + "Only 1x1xN bias is supported for 3D input"); + almc_.bia_tensor_info.set_tensor_shape( + arm_compute::TensorShape(bia_d.dims()[2], 1, 1)); + break; + case 4: + VDISPATCH_MATMUL(bia_d.dims()[0] == 1 && bia_d.dims()[1] == 1 + && bia_d.dims()[2] == 1 && bia_d.dims()[3] == N, + "Only 1x1x1xN bias is supported for 4D input"); + almc_.bia_tensor_info.set_tensor_shape( + arm_compute::TensorShape(bia_d.dims()[3], 1, 1, 1)); + break; + } + } + + arm_compute::GEMMLowpOutputStageInfo info; + info.type = arm_compute::GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT; + info.gemmlowp_multiplier = 1073741824; + info.gemmlowp_shift = -1; + info.gemmlowp_offset = 0; + info.gemmlowp_min_bound = -128; + info.gemmlowp_max_bound = 127; + info.output_data_type = almc_.dst_tensor_info.data_type(); + almc_.gemm_info.set_gemmlowp_output_stage(info); + auto scratchpad = scratchpad_registry().registrar(); + const dnnl::impl::memory_desc_t dst_md_ {desc_.dst_desc}; + arm_compute::ActivationLayerInfo act_info; + + CHECK(init_scratchpad(engine, scratchpad, acl_post_ops, attr_.post_ops_, + act_info, dst_md_)); + almc_.gemm_info.set_activation_info(act_info); + + ACL_CHECK_VALID(arm_compute::NEGEMMLowpMatrixMultiplyCore::validate( + &almc_.src_tensor_info, &almc_.wei_tensor_info, + almc_.with_bias ? 
&almc_.bia_tensor_info : nullptr,
+            &almc_.dst_tensor_info, almc_.gemm_info));
+    return status::success;
+}
+
+status_t acl_lowp_matmul_sq_t::pd_t::init_scratchpad(engine_t *engine,
+        memory_tracking::registrar_t &scratchpad, acl_post_ops_t &post_ops,
+        dnnl::impl::post_ops_t &attr_post_ops,
+        arm_compute::ActivationLayerInfo &act_info,
+        const dnnl::impl::memory_desc_t &dst_md) {
+    CHECK(post_ops.init(engine, attr_post_ops, dst_md, act_info));
+    // ACL only accepts s32 bias for quantization and since
+    // the current bias vector is f32 we need to convert.
+    if (almc_.with_bias) {
+        const memory_desc_wrapper bias_d(&bias_md_);
+        scratchpad.book(memory_tracking::names::key_conv_bias_s32_convert,
+                bias_d.nelems(), bias_d.data_type_size());
+    }
+    return status::success;
+}
+status_t acl_lowp_matmul_sq_t::create_resource(
+        engine_t *engine, resource_mapper_t &mapper) const {
+    if (mapper.has_resource(this)) return status::success;
+    auto r = utils::make_unique<acl_lowp_matmul_sq_resource_t>();
+    if (!r) return status::out_of_memory;
+    CHECK(r->configure(pd()->almc_));
+    mapper.add(this, std::move(r));
+    return status::success;
+}
+status_t acl_lowp_matmul_sq_t::execute(const exec_ctx_t &ctx) const {
+    std::lock_guard<std::mutex> _lock {this->mtx_};
+    bool with_bias = pd()->almc_.with_bias;
+    acl_lowp_matmul_sq_obj_t &acl_obj
+            = ctx.get_resource_mapper()
+                      ->get<acl_lowp_matmul_sq_resource_t>(this)
+                      ->get_acl_obj();
+    auto src = CTX_IN_MEM(const int8_t *, DNNL_ARG_SRC);
+    auto wei = CTX_IN_MEM(const int8_t *, DNNL_ARG_WEIGHTS);
+    auto dst = CTX_OUT_MEM(const int8_t *, DNNL_ARG_DST);
+    acl_obj.src_tensor.allocator()->import_memory(const_cast<int8_t *>(src));
+    acl_obj.wei_tensor.allocator()->import_memory(const_cast<int8_t *>(wei));
+    acl_obj.dst_tensor.allocator()->import_memory(const_cast<int8_t *>(dst));
+    DEFINE_ARG_SCALES_BUFFER(src_scale, DNNL_ARG_SRC);
+    DEFINE_ZERO_POINT_VALUE(src_zero_point, DNNL_ARG_SRC);
+    DEFINE_ARG_SCALES_BUFFER(wei_scale, DNNL_ARG_WEIGHTS);
+    DEFINE_ZERO_POINT_VALUE(wei_zero_point, DNNL_ARG_WEIGHTS);
+    DEFINE_ARG_SCALES_BUFFER(dst_scale, DNNL_ARG_DST);
+    DEFINE_ZERO_POINT_VALUE(dst_zero_point, DNNL_ARG_DST);
+    if (with_bias) {
+        const auto scratchpad = ctx.get_scratchpad_grantor();
+        auto bia_s32_base = scratchpad.get<int32_t>(
+                memory_tracking::names::key_conv_bias_s32_convert);
+        auto bia_f32_base
+                = CTX_IN_MEM(const arm_compute::float32_t *, DNNL_ARG_BIAS);
+        const float bias_scale = 1 / (*src_scale * (*wei_scale));
+        const int num_elements = acl_obj.bia_tensor.info()->total_size()
+                / sizeof(arm_compute::float32_t);
+        parallel_nd(num_elements, [&](dim_t e) {
+            const auto b = int32_t(std::round(bia_f32_base[e] * bias_scale));
+            bia_s32_base[e] = b;
+        });
+        acl_obj.bia_tensor.allocator()->init(*acl_obj.bia_tensor.info());
+        acl_obj.bia_tensor.allocator()->import_memory(bia_s32_base);
+    }
+    acl_obj.src_tensor.info()->set_quantization_info(
+            arm_compute::QuantizationInfo(*src_scale, -src_zero_point, true));
+    acl_obj.wei_tensor.info()->set_quantization_info(
+            arm_compute::QuantizationInfo(*wei_scale, -wei_zero_point, true));
+    // for efficiency reasons, oneDNN saves the inverse of the destination
+    // scale, so it is inverted back here before being passed to ACL
+    acl_obj.dst_tensor.info()->set_quantization_info(
+            arm_compute::QuantizationInfo(
+                    1.0 / (*dst_scale), dst_zero_point, true));
+    // The two calls below are stateful and, therefore, not fully thread-safe.
+    // This issue is being addressed, and the lock will be removed when the
+    // matmul stateless work is finished.
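A worked recap of the bias conversion earlier in execute() (illustrative numbers only): ACL's low-precision GEMM consumes an s32 bias, while oneDNN hands over f32, so every element is requantized by the combined input scale:

    // bias_s32[e] = round(bias_f32[e] / (src_scale * wei_scale))
    // e.g. src_scale = 0.05f, wei_scale = 0.1f, bias_f32[e] = 1.5f
    //      -> 1.5f / 0.005f = 300.0f -> bias_s32[e] = 300

The two stateful calls that the comment above refers to come next.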
+    acl_obj.gemm.update_quantization_parameters();
+    acl_obj.gemm.run();
+    // free() here tells ACL it can no longer use this memory; it does not
+    // deallocate it.
+    acl_obj.src_tensor.allocator()->free();
+    acl_obj.wei_tensor.allocator()->free();
+    if (with_bias) { acl_obj.bia_tensor.allocator()->free(); }
+    acl_obj.dst_tensor.allocator()->free();
+    return status::success;
+};
+} // namespace matmul
+} // namespace acl
+} // namespace cpu
+} // namespace impl
+} // namespace dnnl
diff --git a/src/cpu/acl/matmul/acl_lowp_matmul_sq.hpp b/src/cpu/acl/matmul/acl_lowp_matmul_sq.hpp
new file mode 100644
index 00000000000..d9c6192206f
--- /dev/null
+++ b/src/cpu/acl/matmul/acl_lowp_matmul_sq.hpp
@@ -0,0 +1,105 @@
+/*******************************************************************************
+* Copyright 2025 Arm Ltd. and affiliates
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#ifndef ACL_LOWP_MATMUL_SQ_HPP
+#define ACL_LOWP_MATMUL_SQ_HPP
+
+#include <mutex>
+
+#include "cpu/cpu_primitive.hpp"
+#include "cpu/matmul/cpu_matmul_pd.hpp"
+#include "cpu/matmul/matmul_utils.hpp"
+
+#include "cpu/acl/acl_post_ops.hpp"
+
+#include "arm_compute/core/CPP/CPPTypes.h"
+
+namespace dnnl {
+namespace impl {
+namespace cpu {
+namespace acl {
+namespace matmul {
+
+struct acl_lowp_matmul_sq_obj_t {
+    arm_compute::GEMMLowpOutputStageInfo info;
+    arm_compute::NEGEMMLowpMatrixMultiplyCore gemm;
+    arm_compute::Tensor src_tensor;
+    arm_compute::Tensor wei_tensor;
+    arm_compute::Tensor bia_tensor;
+    arm_compute::Tensor dst_tensor;
+};
+
+struct acl_lowp_matmul_sq_conf_t {
+    bool with_bias;
+    arm_compute::TensorInfo src_tensor_info;
+    arm_compute::TensorInfo wei_tensor_info;
+    arm_compute::TensorInfo bia_tensor_info;
+    arm_compute::TensorInfo dst_tensor_info;
+    arm_compute::GEMMInfo gemm_info;
+};
+
+struct acl_lowp_matmul_sq_resource_t : public resource_t {
+    acl_lowp_matmul_sq_resource_t()
+        : acl_obj_(utils::make_unique<acl_lowp_matmul_sq_obj_t>()) {}
+
+    status_t configure(const acl_lowp_matmul_sq_conf_t &almc);
+
+    acl_lowp_matmul_sq_obj_t &get_acl_obj() const { return *acl_obj_; }
+
+    DNNL_DISALLOW_COPY_AND_ASSIGN(acl_lowp_matmul_sq_resource_t);
+
+private:
+    std::unique_ptr<acl_lowp_matmul_sq_obj_t> acl_obj_;
+};
+
+struct acl_lowp_matmul_sq_t : public primitive_t {
+    struct pd_t : public dnnl::impl::cpu::matmul::cpu_matmul_pd_t {
+
+        using cpu_matmul_pd_t::cpu_matmul_pd_t;
+
+        DECLARE_COMMON_PD_T("lowp_gemm_sq:acl", acl_lowp_matmul_sq_t,
+                USE_GLOBAL_SCRATCHPAD);
+
+        status_t init(engine_t *engine);
+
+        status_t init_scratchpad(engine_t *engine,
+                memory_tracking::registrar_t &scratchpad,
+                acl_post_ops_t &post_ops, dnnl::impl::post_ops_t &attr_post_ops,
+                arm_compute::ActivationLayerInfo &act_info,
+                const dnnl::impl::memory_desc_t &dst_md);
+
+        acl_lowp_matmul_sq_conf_t almc_;
+        acl_post_ops_t acl_post_ops;
+    };
+
+    acl_lowp_matmul_sq_t(const pd_t *apd) : primitive_t(apd) {}
+
+    status_t create_resource(engine_t *engine, resource_mapper_t &mapper) const;
+
+    status_t execute(const exec_ctx_t &ctx) const;
+
+private:
+    mutable std::mutex mtx_;
+    const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); }
+};
+
+} // namespace matmul
+} // namespace acl
+} // namespace cpu
+} // namespace impl
+} // namespace dnnl
+
+#endif // ACL_LOWP_MATMUL_SQ_HPP
\ No newline at end of file
diff --git a/src/cpu/aarch64/matmul/acl_matmul.cpp b/src/cpu/acl/matmul/acl_matmul.cpp
similarity index 85%
rename from src/cpu/aarch64/matmul/acl_matmul.cpp
rename to src/cpu/acl/matmul/acl_matmul.cpp
index 3d3e95a491d..2da752d7883 100644
--- a/src/cpu/aarch64/matmul/acl_matmul.cpp
+++ b/src/cpu/acl/matmul/acl_matmul.cpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2021-2024 Arm Ltd. and affiliates
+* Copyright 2021-2025 Arm Ltd. and affiliates
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -14,18 +14,20 @@
 * limitations under the License.
 *******************************************************************************/
 
-#include "cpu/aarch64/matmul/acl_matmul.hpp"
+#include "cpu/acl/matmul/acl_matmul.hpp"
+
+#include <mutex>
 
 namespace dnnl {
 namespace impl {
 namespace cpu {
-namespace aarch64 {
+namespace acl {
 namespace matmul {
 
 using namespace data_type;
 
 namespace {
-using data_t = prec_traits<data_type::f32>::type;
+using data_t = prec_traits_t<data_type::f32>::type;
 } // namespace
 
 status_t acl_matmul_t::init(engine_t *engine) {
@@ -76,19 +78,24 @@ status_t acl_matmul_t::pd_t::init(engine_t *engine) {
             = utils::everyone_is(data_type::bf16, src_md()->data_type,
                       weights_md()->data_type, dst_md()->data_type)
             && platform::has_data_type_support(data_type::bf16);
+    const bool is_bf16f32_ok
+            = utils::everyone_is(data_type::bf16, src_md()->data_type,
+                      weights_md()->data_type)
+            && utils::everyone_is(data_type::f32, dst_md()->data_type)
+            && platform::has_data_type_support(data_type::bf16);
 
     // we need to save this state as it can change inside set_default_formats()
     weights_format_kind_ = weights_md_.format_kind;
 
     VDISPATCH_MATMUL(is_dense_format_kind(), VERBOSE_UNSUPPORTED_SPARSE_CFG);
-    VDISPATCH_MATMUL(utils::one_of(true, is_fp32_ok, is_fp16_ok, is_bf16_ok),
+    VDISPATCH_MATMUL(utils::one_of(true, is_fp32_ok, is_fp16_ok, is_bf16_ok,
+                             is_bf16f32_ok),
             VERBOSE_UNSUPPORTED_DT_CFG);
     VDISPATCH_MATMUL(!has_zero_dim_memory(), VERBOSE_EMPTY_TENSOR, "");
     VDISPATCH_MATMUL(set_default_formats(), VERBOSE_UNSUPPORTED_TAG);
-    VDISPATCH_MATMUL(attr()->has_default_values(smask_t::oscale
-                             | smask_t::post_ops | smask_t::fpmath_mode),
+    VDISPATCH_MATMUL(attr()->has_default_values(
+                             smask_t::post_ops | smask_t::fpmath_mode),
             VERBOSE_UNSUPPORTED_ATTR);
-    VDISPATCH_MATMUL(attr_oscale_ok(), VERBOSE_UNSUPPORTED_SCALES_CFG);
     VDISPATCH_MATMUL(
             !has_runtime_dims_or_strides(), VERBOSE_RUNTIMEDIM_UNSUPPORTED);
@@ -145,18 +152,16 @@ status_t acl_matmul_t::pd_t::init(engine_t *engine) {
     auto scratchpad = scratchpad_registry().registrar();
     arm_compute::experimental::MemoryRequirements aux_mem_req;
 
-    // Query buffer memory requirement, if not using fixed-format kernel
-    if (weights_format_kind_ != format_kind::any) {
-        arm_compute::experimental::op::ll::CpuGemmAssemblyDispatch asm_gemm;
-        if (amp_.do_transC) {
-            asm_gemm.configure(&amp_.wei_tensor_info, &amp_.src_tensor_info,
-                    nullptr, &amp_.dst_acc_info, amp_.gemm_info);
-        } else {
-            asm_gemm.configure(&amp_.src_tensor_info, &amp_.wei_tensor_info,
-                    nullptr, &amp_.dst_tensor_info, amp_.gemm_info);
-        }
-        aux_mem_req = asm_gemm.workspace();
+    // Query buffer memory requirement
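This hunk removes the fixed-format special case: the assembly dispatch object is now always configured up front so that its workspace requirements can be booked on the oneDNN scratchpad. A condensed sketch of that flow, where slot_to_key() is a hypothetical stand-in for the matmul_keys map defined in acl_matmul_utils.hpp:

    asm_gemm.configure(&src_info, &wei_info, nullptr, &dst_info, gemm_info);
    for (const auto &req : asm_gemm.workspace()) // per-slot requirements
        scratchpad.book(slot_to_key(req.slot), req.size, 1, req.alignment);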
+    arm_compute::experimental::op::ll::CpuGemmAssemblyDispatch asm_gemm;
+    if (amp_.do_transC) {
+        asm_gemm.configure(&amp_.wei_tensor_info, &amp_.src_tensor_info,
+                nullptr, &amp_.dst_acc_info, amp_.gemm_info);
+    } else {
+        asm_gemm.configure(&amp_.src_tensor_info, &amp_.wei_tensor_info,
+                nullptr, &amp_.dst_tensor_info, amp_.gemm_info);
     }
+    aux_mem_req = asm_gemm.workspace();
 
     CHECK(acl_matmul_utils::init_scratchpad(
             scratchpad, amp_, src_md_, weights_md_, dst_md_, aux_mem_req));
@@ -165,12 +170,20 @@ template <bool IsFixedFormat>
 status_t acl_matmul_t::execute_forward(const exec_ctx_t &ctx) const {
-    status_t status = status::success;
     auto src_base = CTX_IN_MEM(const data_t *, DNNL_ARG_SRC);
     auto wei_base = CTX_IN_MEM(const data_t *, DNNL_ARG_WEIGHTS);
 
-    auto amp = pd()->amp_;
+    const auto &amp = pd()->amp_;
+
+    std::unique_lock<std::mutex> locker {mtx_, std::defer_lock};
+
+    // Some of the underlying kernels used by ACL still require some state and
+    // are not safe to be called in parallel with different execution contexts.
+    // Eventually, when all kernels are truly stateless, this guard can be
+    // removed.
+    if (!acl_obj_->asm_gemm.has_stateless_impl()) { locker.lock(); }
+
     bool is_transA = amp.is_transA;
     bool is_transB = amp.is_transB;
     bool do_transC = amp.do_transC;
@@ -287,27 +300,28 @@ status_t acl_matmul_t::execute_forward(const exec_ctx_t &ctx) const {
     }
 
     // Get pointer to scratchpad memory and create a workspace tensor for
-    // CpuGemm. Fixed-format kernel does not need this workspace tensor.
+    // CpuGemmAssemblyDispatch.
     std::vector<arm_compute::Tensor> tmp_tensors(acl_obj_->aux_mem_req.size());
-    if (!IsFixedFormat) {
-        for (const auto &key : matmul_keys) {
-            const auto id = key.first;
-            if (acl_obj_->aux_mem_req[id].size > 0) {
-                const auto info = arm_compute::TensorInfo(
-                        arm_compute::TensorShape(
-                                acl_obj_->aux_mem_req[id].size),
-                        1, arm_compute::DataType::U8);
-                auto buffer = scratchpad.get(key.second);
-                tmp_tensors[id].allocator()->init(
-                        info, acl_obj_->aux_mem_req[id].alignment);
-                tmp_tensors[id].allocator()->import_memory(buffer);
-                matmul_pack.add_tensor(
-                        acl_obj_->aux_mem_req[id].slot, &tmp_tensors[id]);
-            }
+    for (const auto &key : matmul_keys) {
+        const auto id = key.first;
+        if (acl_obj_->aux_mem_req[id].size > 0) {
+            auto info = arm_compute::TensorInfo(
+                    arm_compute::TensorShape(acl_obj_->aux_mem_req[id].size), 1,
+                    arm_compute::DataType::U8);
+
+            auto *buffer = scratchpad.get(key.second);
+
+            tmp_tensors[id].allocator()->init(
+                    info, acl_obj_->aux_mem_req[id].alignment);
+            tmp_tensors[id].allocator()->import_memory(buffer);
+
+            matmul_pack.add_tensor(
+                    acl_obj_->aux_mem_req[id].slot, &tmp_tensors[id]);
        }
     }
     acl_obj_->asm_gemm.run(matmul_pack);
+
     if (do_act) {
         auto dst_to_use = do_transC ? &dst_acc_tensor : &dst_tensor;
         arm_compute::ITensorPack act_pack;
@@ -337,7 +351,7 @@ template status_t acl_matmul_t::execute_forward(
         const exec_ctx_t &ctx) const;
 
 } // namespace matmul
-} // namespace aarch64
+} // namespace acl
 } // namespace cpu
 } // namespace impl
 } // namespace dnnl
diff --git a/src/cpu/aarch64/matmul/acl_matmul.hpp b/src/cpu/acl/matmul/acl_matmul.hpp
similarity index 76%
rename from src/cpu/aarch64/matmul/acl_matmul.hpp
rename to src/cpu/acl/matmul/acl_matmul.hpp
index 30641a746a7..cd587e2cd60 100644
--- a/src/cpu/aarch64/matmul/acl_matmul.hpp
+++ b/src/cpu/acl/matmul/acl_matmul.hpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2021-2024 Arm Ltd. and affiliates
+* Copyright 2021-2025 Arm Ltd.
and affiliates * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,38 +17,29 @@ #ifndef ACL_MATMUL_HPP #define ACL_MATMUL_HPP -#include "common/utils.hpp" -#include "cpu/aarch64/acl_post_ops.hpp" -#include "cpu/aarch64/matmul/acl_matmul_utils.hpp" +#include "cpu/acl/acl_post_ops.hpp" +#include "cpu/acl/matmul/acl_matmul_utils.hpp" +#include "cpu/matmul/cpu_matmul_pd.hpp" + +#include namespace dnnl { namespace impl { namespace cpu { -namespace aarch64 { +namespace acl { namespace matmul { struct acl_matmul_t : public primitive_t { struct pd_t : public dnnl::impl::cpu::matmul::cpu_matmul_pd_t { - - pd_t(const matmul_desc_t *adesc, const primitive_attr_t *attr, - const cpu_matmul_pd_t *hint_fwd_pd) - : cpu_matmul_pd_t(adesc, attr, hint_fwd_pd), amp_() {} - using cpu_matmul_pd_t::cpu_matmul_pd_t; DECLARE_COMMON_PD_T("gemm:acl", acl_matmul_t, USE_GLOBAL_SCRATCHPAD); status_t init(engine_t *engine); - acl_matmul_conf_t amp_; + acl_matmul_conf_t amp_ = utils::zero(); acl_post_ops_t acl_post_ops; dnnl::impl::format_kind_t weights_format_kind_; - - protected: - bool attr_oscale_ok() const { - const auto &oscale = attr()->output_scales_; - return oscale.mask_ == 0; - } }; acl_matmul_t(const pd_t *apd) @@ -71,10 +62,11 @@ struct acl_matmul_t : public primitive_t { const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); } std::unique_ptr acl_obj_; + mutable std::mutex mtx_; }; // acl_matmul_t } // namespace matmul -} // namespace aarch64 +} // namespace acl } // namespace cpu } // namespace impl } // namespace dnnl diff --git a/src/cpu/aarch64/matmul/acl_matmul_utils.cpp b/src/cpu/acl/matmul/acl_matmul_utils.cpp similarity index 92% rename from src/cpu/aarch64/matmul/acl_matmul_utils.cpp rename to src/cpu/acl/matmul/acl_matmul_utils.cpp index a921422ac0b..9ea9cfcdde1 100644 --- a/src/cpu/aarch64/matmul/acl_matmul_utils.cpp +++ b/src/cpu/acl/matmul/acl_matmul_utils.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2021-2024 Arm Ltd. and affiliates +* Copyright 2021-2025 Arm Ltd. and affiliates * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,14 +14,14 @@ * limitations under the License. 
*******************************************************************************/ +#include "cpu/acl/matmul/acl_matmul_utils.hpp" +#include "cpu/acl/acl_utils.hpp" #include "cpu/matmul/matmul_utils.hpp" -#include "cpu/aarch64/matmul/acl_matmul_utils.hpp" - namespace dnnl { namespace impl { namespace cpu { -namespace aarch64 { +namespace acl { namespace acl_matmul_utils { @@ -47,10 +47,26 @@ status_t init_conf_matmul(acl_matmul_conf_t &, memory_desc_t &src_md, // for e.g when ab in abcd is 1x1 bool batch_ok = IMPLICATION(src_batch > 1, wei_batch == 1) && IMPLICATION(wei_batch > 1, src_batch == 1); + ACL_CHECK_SUPPORT(src_d.ndims() == 4 && src_batch != wei_batch && !batch_ok, "matmul broadcast supported only for 3D shapes and 4D shapes when " "ab is 1x1"); + if (src_d.ndims() == 4 && src_batch == wei_batch + && src_d.dims()[0] != wei_d.dims()[0]) { // 4D broadcast occurred + if (src_d.dims()[0] == 1 && wei_d.dims()[0] != 1) { // Broadcast src + ACL_CHECK_SUPPORT( + IMPLICATION(src_d.dims()[1] != 1, wei_d.dims()[1] == 1), + "acl only broadcasts one of src or wei at once"); + } + + if (wei_d.dims()[0] == 1 && src_d.dims()[0] != 1) { // Broadcast wei + ACL_CHECK_SUPPORT( + IMPLICATION(src_d.dims()[1] == 1, wei_d.dims()[1] != 1), + "acl only broadcasts one of src or wei at once"); + } + } + // ACL does not support bias bool with_bias = md.bias_desc.format_kind != format_kind::undef; ACL_CHECK_SUPPORT(with_bias, "ACL does not support bias for matmul"); @@ -221,7 +237,7 @@ template status_t init_conf_matmul(acl_matmul_conf_t &, } // namespace acl_matmul_utils -} // namespace aarch64 +} // namespace acl } // namespace cpu } // namespace impl } // namespace dnnl diff --git a/src/cpu/aarch64/matmul/acl_matmul_utils.hpp b/src/cpu/acl/matmul/acl_matmul_utils.hpp similarity index 85% rename from src/cpu/aarch64/matmul/acl_matmul_utils.hpp rename to src/cpu/acl/matmul/acl_matmul_utils.hpp index cc8eae44ea7..d55cf71263f 100644 --- a/src/cpu/aarch64/matmul/acl_matmul_utils.hpp +++ b/src/cpu/acl/matmul/acl_matmul_utils.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2021-2024 Arm Ltd. and affiliates +* Copyright 2021-2025 Arm Ltd. and affiliates * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,30 +14,30 @@ * limitations under the License. *******************************************************************************/ -#ifndef CPU_AARCH64_ACL_MATMUL_UTILS_HPP -#define CPU_AARCH64_ACL_MATMUL_UTILS_HPP +#ifndef CPU_ACL_MATMUL_UTILS_HPP +#define CPU_ACL_MATMUL_UTILS_HPP #include "arm_compute/runtime/experimental/low_level/CpuGemmAssemblyDispatch.h" #include "arm_compute/runtime/experimental/operators/CpuActivation.h" #include "arm_compute/runtime/experimental/operators/CpuTranspose.h" -#include "cpu/matmul/cpu_matmul_pd.hpp" - -#include "cpu/aarch64/acl_utils.hpp" +#include "common/memory_tracking.hpp" namespace dnnl { namespace impl { namespace cpu { -namespace aarch64 { +namespace acl { namespace { // Keys are anonymous. So deduce the type automagically. 
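A self-contained illustration of the decltype trick used below (it assumes nothing beyond standard C++): the key constants are enumerators of an unnamed enum, so deduction is the only way to name their type:

    #include <map>
    enum { first_key, second_key };    // anonymous enum: the type is unnamed
    using key_t = decltype(first_key); // recover the enum's type anyway
    const std::map<int, key_t> slot_to_key {{0, first_key}, {1, second_key}};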
using matmul_key_t = decltype(memory_tracking::names::key_gemm_asm_tmp_buffer); // Map: [slot , key] -const std::map matmul_keys - = {{0, matmul_key_t::key_gemm_asm_tmp_buffer}, - {2, matmul_key_t::key_gemm_pretranspose}}; +const std::map matmul_keys = { + {0, matmul_key_t::key_gemm_asm_tmp_buffer}, + {1, matmul_key_t::key_gemm_pretransposed_rhs}, + {2, matmul_key_t::key_gemm_pretranspose}, +}; } // namespace struct acl_matmul_obj_t { @@ -80,9 +80,9 @@ status_t init_scratchpad(memory_tracking::registrar_t &scratchpad, } // namespace acl_matmul_utils -} // namespace aarch64 +} // namespace acl } // namespace cpu } // namespace impl } // namespace dnnl -#endif // CPU_AARCH64_ACL_MATMUL_UTILS_HPP +#endif // CPU_ACL_MATMUL_UTILS_HPP diff --git a/src/cpu/binary_injector_utils.cpp b/src/cpu/binary_injector_utils.cpp index 339979907ce..b6dc0688cfe 100644 --- a/src/cpu/binary_injector_utils.cpp +++ b/src/cpu/binary_injector_utils.cpp @@ -30,7 +30,7 @@ std::vector prepare_binary_args(const post_ops_t &post_ops, unsigned idx = first_arg_idx_offset; for (const auto &post_op : post_ops.entry_) { - if (post_op.is_binary()) { + if (post_op.is_binary() || post_op.is_depthwise() || post_op.is_quantization()) { post_ops_binary_rhs_arg_vec.emplace_back(CTX_IN_MEM(const void *, DNNL_ARG_ATTR_MULTIPLE_POST_OP(idx) | DNNL_ARG_SRC_1)); } diff --git a/src/cpu/cpu_batch_normalization_list.cpp b/src/cpu/cpu_batch_normalization_list.cpp index ab093a380f0..cf7490ccbaa 100644 --- a/src/cpu/cpu_batch_normalization_list.cpp +++ b/src/cpu/cpu_batch_normalization_list.cpp @@ -32,11 +32,12 @@ using namespace dnnl::impl::cpu::x64; #if DNNL_AARCH64 #include "cpu/aarch64/jit_uni_batch_normalization.hpp" #include "cpu/aarch64/jit_uni_batch_normalization_s8.hpp" -#if DNNL_AARCH64_USE_ACL -#include "cpu/aarch64/acl_batch_normalization.hpp" -#endif using namespace dnnl::impl::cpu::aarch64; #endif +#if DNNL_USE_ACL +#include "cpu/acl/acl_batch_normalization.hpp" +using namespace dnnl::impl::cpu::acl; +#endif namespace dnnl { namespace impl { @@ -51,52 +52,43 @@ const std::map> &impl_list_map() { static const std::map> the_map = REG_BNORM_P({ {{forward}, { /* fp */ - CPU_INSTANCE_X64(jit_uni_batch_normalization_fwd_t) - CPU_INSTANCE_X64(jit_uni_batch_normalization_fwd_t) - CPU_INSTANCE_X64(jit_uni_batch_normalization_fwd_t) - CPU_INSTANCE_X64(jit_uni_tbb_batch_normalization_fwd_t) - CPU_INSTANCE_X64(jit_uni_tbb_batch_normalization_fwd_t) - CPU_INSTANCE_X64(jit_uni_tbb_batch_normalization_fwd_t) - CPU_INSTANCE_AARCH64(jit_uni_batch_normalization_fwd_t) - CPU_INSTANCE_AARCH64(jit_uni_batch_normalization_fwd_t) - CPU_INSTANCE_AARCH64(jit_uni_batch_normalization_fwd_t) - CPU_INSTANCE_AARCH64_ACL(acl_batch_normalization_fwd_t) - CPU_INSTANCE(ncsp_batch_normalization_fwd_t) - CPU_INSTANCE(ncsp_batch_normalization_fwd_t) - CPU_INSTANCE(ncsp_batch_normalization_fwd_t) - CPU_INSTANCE(nspc_batch_normalization_fwd_t) - CPU_INSTANCE(nspc_batch_normalization_fwd_t) - CPU_INSTANCE(nspc_batch_normalization_fwd_t) - CPU_INSTANCE(ref_batch_normalization_fwd_t) - CPU_INSTANCE(ref_batch_normalization_fwd_t) - CPU_INSTANCE(ref_batch_normalization_fwd_t) + CPU_INSTANCE_X64(jit_uni_batch_normalization_fwd_t, avx512_core) + CPU_INSTANCE_X64(jit_uni_batch_normalization_fwd_t, avx2) + CPU_INSTANCE_X64(jit_uni_batch_normalization_fwd_t, sse41) + CPU_INSTANCE_X64(jit_uni_tbb_batch_normalization_fwd_t, avx512_core) + CPU_INSTANCE_X64(jit_uni_tbb_batch_normalization_fwd_t, avx2) + CPU_INSTANCE_X64(jit_uni_tbb_batch_normalization_fwd_t, sse41) + 
CPU_INSTANCE_AARCH64(jit_uni_batch_normalization_fwd_t, sve_512) + CPU_INSTANCE_AARCH64(jit_uni_batch_normalization_fwd_t, asimd) + CPU_INSTANCE(ncsp_batch_normalization_fwd_t, f32) + CPU_INSTANCE(ncsp_batch_normalization_fwd_t, bf16) + CPU_INSTANCE(nspc_batch_normalization_fwd_t, f32) + CPU_INSTANCE(nspc_batch_normalization_fwd_t, bf16) + CPU_INSTANCE(ref_batch_normalization_fwd_t, f32) + CPU_INSTANCE(ref_batch_normalization_fwd_t, bf16) /* int */ - CPU_INSTANCE_X64(jit_uni_batch_normalization_s8_fwd_t) - CPU_INSTANCE_X64(jit_uni_batch_normalization_s8_fwd_t) - CPU_INSTANCE_X64(jit_uni_batch_normalization_s8_fwd_t) - CPU_INSTANCE_AARCH64(jit_uni_batch_normalization_s8_fwd_t) - CPU_INSTANCE(ref_batch_normalization_fwd_t) + CPU_INSTANCE_X64(jit_uni_batch_normalization_s8_fwd_t, avx512_core) + CPU_INSTANCE_X64(jit_uni_batch_normalization_s8_fwd_t, avx2) + CPU_INSTANCE_X64(jit_uni_batch_normalization_s8_fwd_t, sse41) + CPU_INSTANCE_AARCH64(jit_uni_batch_normalization_s8_fwd_t, sve_512) + CPU_INSTANCE(ref_batch_normalization_fwd_t, s8) nullptr, }}, {{backward}, REG_BWD_PK({ - CPU_INSTANCE_X64(jit_uni_batch_normalization_bwd_t) - CPU_INSTANCE_X64(jit_uni_batch_normalization_bwd_t) - CPU_INSTANCE_X64(jit_uni_batch_normalization_bwd_t) - CPU_INSTANCE_X64(jit_uni_tbb_batch_normalization_bwd_t) - CPU_INSTANCE_X64(jit_uni_tbb_batch_normalization_bwd_t) - CPU_INSTANCE_X64(jit_uni_tbb_batch_normalization_bwd_t) - CPU_INSTANCE_AARCH64(jit_uni_batch_normalization_bwd_t) - CPU_INSTANCE_AARCH64(jit_uni_batch_normalization_bwd_t) - CPU_INSTANCE_AARCH64(jit_uni_batch_normalization_bwd_t) - CPU_INSTANCE(ncsp_batch_normalization_bwd_t) - CPU_INSTANCE(ncsp_batch_normalization_bwd_t) - CPU_INSTANCE(ncsp_batch_normalization_bwd_t) - CPU_INSTANCE(nspc_batch_normalization_bwd_t) - CPU_INSTANCE(nspc_batch_normalization_bwd_t) - CPU_INSTANCE(nspc_batch_normalization_bwd_t) - CPU_INSTANCE(ref_batch_normalization_bwd_t) - CPU_INSTANCE(ref_batch_normalization_bwd_t) - CPU_INSTANCE(ref_batch_normalization_bwd_t) + CPU_INSTANCE_X64(jit_uni_batch_normalization_bwd_t, avx512_core) + CPU_INSTANCE_X64(jit_uni_batch_normalization_bwd_t, avx2) + CPU_INSTANCE_X64(jit_uni_batch_normalization_bwd_t, sse41) + CPU_INSTANCE_X64(jit_uni_tbb_batch_normalization_bwd_t, avx512_core) + CPU_INSTANCE_X64(jit_uni_tbb_batch_normalization_bwd_t, avx2) + CPU_INSTANCE_X64(jit_uni_tbb_batch_normalization_bwd_t, sse41) + CPU_INSTANCE_AARCH64(jit_uni_batch_normalization_bwd_t, sve_512) + CPU_INSTANCE_AARCH64(jit_uni_batch_normalization_bwd_t, asimd) + CPU_INSTANCE(ncsp_batch_normalization_bwd_t, f32) + CPU_INSTANCE(ncsp_batch_normalization_bwd_t, bf16) + CPU_INSTANCE(nspc_batch_normalization_bwd_t, f32) + CPU_INSTANCE(nspc_batch_normalization_bwd_t, bf16) + CPU_INSTANCE(ref_batch_normalization_bwd_t, f32) + CPU_INSTANCE(ref_batch_normalization_bwd_t, bf16) nullptr, })}, }); diff --git a/src/cpu/cpu_binary_list.cpp b/src/cpu/cpu_binary_list.cpp index 49bad158f1c..d37a0d39017 100644 --- a/src/cpu/cpu_binary_list.cpp +++ b/src/cpu/cpu_binary_list.cpp @@ -25,11 +25,12 @@ using namespace dnnl::impl::cpu::x64; #elif DNNL_AARCH64 #include "cpu/aarch64/jit_uni_binary.hpp" -#if DNNL_AARCH64_USE_ACL -#include "cpu/aarch64/acl_binary.hpp" -#endif using namespace dnnl::impl::cpu::aarch64; #endif +#if DNNL_USE_ACL +#include "cpu/acl/acl_binary.hpp" +using namespace dnnl::impl::cpu::acl; +#endif namespace dnnl { namespace impl { @@ -39,10 +40,10 @@ namespace { using namespace dnnl::impl::data_type; // clang-format off -constexpr impl_list_item_t impl_list[] = 
REG_BINARY_P({ +const impl_list_item_t impl_list[] = REG_BINARY_P({ CPU_INSTANCE_X64(jit_uni_binary_t) CPU_INSTANCE_AARCH64(jit_uni_binary_t) - CPU_INSTANCE_AARCH64_ACL(acl_binary_t) + CPU_INSTANCE_ACL(acl_binary_t) CPU_INSTANCE(ref_binary_t) /* eol */ nullptr, diff --git a/src/cpu/cpu_concat.cpp b/src/cpu/cpu_concat.cpp index 0af6336d709..06567411cdf 100644 --- a/src/cpu/cpu_concat.cpp +++ b/src/cpu/cpu_concat.cpp @@ -26,22 +26,24 @@ namespace cpu { namespace { using namespace dnnl::impl::data_type; -#define INSTANCE(...) \ +#define INSTANCE_IMPL(...) \ impl_list_item_t(impl_list_item_t::concat_type_deduction_helper_t< \ - __VA_ARGS__::pd_t>()), + __VA_ARGS__::pd_t>()) +#define INSTANCE(...) DNNL_PRIMITIVE_IMPL(INSTANCE_IMPL, __VA_ARGS__) // clang-format off -constexpr impl_list_item_t cpu_concat_impl_list[] = REG_CONCAT_P({ - INSTANCE(simple_concat_t) - INSTANCE(simple_concat_t) - INSTANCE(simple_concat_t) - INSTANCE(simple_concat_t) - INSTANCE(simple_concat_t) - INSTANCE(simple_concat_t) +const impl_list_item_t cpu_concat_impl_list[] = REG_CONCAT_P({ + INSTANCE(simple_concat_t, f32) + INSTANCE(simple_concat_t, u8) + INSTANCE(simple_concat_t, s8) + INSTANCE(simple_concat_t, s32) + INSTANCE(simple_concat_t, bf16) + INSTANCE(simple_concat_t, f16) INSTANCE(ref_concat_t) nullptr, }); // clang-format on #undef INSTANCE +#undef INSTANCE_IMPL } // namespace const impl_list_item_t * diff --git a/src/cpu/cpu_convolution_list.cpp b/src/cpu/cpu_convolution_list.cpp index daf5cb4915d..bc38bb3f478 100644 --- a/src/cpu/cpu_convolution_list.cpp +++ b/src/cpu/cpu_convolution_list.cpp @@ -1,7 +1,7 @@ /******************************************************************************* -* Copyright 2019-2024 Intel Corporation -* Copyright 2020-2024 Arm Ltd. and affiliates -* Copyright 2020-2024 FUJITSU LIMITED +* Copyright 2019-2025 Intel Corporation +* Copyright 2020-2025 Arm Ltd. and affiliates +* Copyright 2020-2025 FUJITSU LIMITED * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -34,6 +34,7 @@ #include "cpu/x64/gemm_bf16_convolution.hpp" #include "cpu/x64/ip_convolution.hpp" #include "cpu/x64/jit_avx2_1x1_convolution.hpp" +#include "cpu/x64/jit_avx2_1x1_convolution_with_dw_conv.hpp" #include "cpu/x64/jit_avx2_convolution.hpp" #include "cpu/x64/jit_avx512_common_1x1_convolution.hpp" #include "cpu/x64/jit_avx512_common_convolution.hpp" @@ -52,25 +53,30 @@ #include "cpu/x64/jit_sse41_1x1_convolution.hpp" #include "cpu/x64/jit_sse41_convolution.hpp" #include "cpu/x64/jit_uni_dw_convolution.hpp" +#include "cpu/x64/jit_uni_fork_dw_convolution.hpp" +#include "cpu/x64/jit_uni_ncsp_convolution.hpp" #include "cpu/x64/jit_uni_x8s8s32x_1x1_convolution.hpp" #include "cpu/x64/jit_uni_x8s8s32x_convolution.hpp" +#include "cpu/x64/jit_uni_planar_convolution.hpp" using namespace dnnl::impl::cpu::x64; #elif DNNL_AARCH64 #include "cpu/aarch64/jit_brdgmm_dw_conv.hpp" #include "cpu/aarch64/jit_brgemm_1x1_conv.hpp" #include "cpu/aarch64/jit_brgemm_conv.hpp" -#include "cpu/aarch64/jit_sve_512_1x1_convolution.hpp" +#include "cpu/aarch64/jit_brgemm_conv_bwd.hpp" +#include "cpu/aarch64/jit_sve_1x1_convolution.hpp" #include "cpu/aarch64/jit_sve_512_x8s8s32x_convolution.hpp" #include "cpu/aarch64/jit_sve_convolution.hpp" #include "cpu/aarch64/jit_uni_dw_convolution.hpp" -#if DNNL_AARCH64 && DNNL_AARCH64_USE_ACL -#include "cpu/aarch64/acl_depthwise_convolution.hpp" -#include "cpu/aarch64/acl_gemm_convolution.hpp" -#include "cpu/aarch64/acl_indirect_gemm_convolution.hpp" -#include "cpu/aarch64/acl_winograd_convolution.hpp" -#endif using namespace dnnl::impl::cpu::aarch64; #endif +#if DNNL_USE_ACL +#include "cpu/acl/acl_gemm_convolution.hpp" +#include "cpu/acl/acl_indirect_gemm_convolution.hpp" +#include "cpu/acl/acl_depthwise_convolution.hpp" +#include "cpu/acl/acl_winograd_convolution.hpp" +using namespace dnnl::impl::cpu::acl; +#endif namespace dnnl { namespace impl { @@ -84,8 +90,8 @@ using namespace dnnl::impl::prop_kind; { \ {forward, dtsrc, dtwei, dtdst}, { \ CPU_INSTANCE_AMX( \ - brgemm_1x1_convolution_fwd_t) \ - CPU_INSTANCE_AMX(brgemm_convolution_fwd_t) \ + brgemm_1x1_convolution_fwd_t, avx10_1_512_amx_fp16) \ + CPU_INSTANCE_AMX(brgemm_convolution_fwd_t, avx10_1_512_amx_fp16) \ CPU_INSTANCE(ref_convolution_fwd_t) nullptr, \ } \ } @@ -121,75 +127,105 @@ const std::map> &impl_list_map() {{forward, f32, f32, f32}, { CPU_INSTANCE_AVX512(brdgmm_dw_convolution_fwd_t) CPU_INSTANCE_X64(ip_convolution_fwd_t) - CPU_INSTANCE_AMX(brgemm_1x1_convolution_fwd_t) - CPU_INSTANCE_AMX(brgemm_convolution_fwd_t) - CPU_INSTANCE_AVX512(brgemm_1x1_convolution_fwd_t) - CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t) + CPU_INSTANCE_AMX(brgemm_1x1_convolution_fwd_t, avx512_core_amx) + CPU_INSTANCE_AMX(brgemm_convolution_fwd_t, avx512_core_amx) + CPU_INSTANCE_AVX512(jit_avx512_common_planar_convolution_fwd_t) + CPU_INSTANCE_AVX512(brgemm_1x1_convolution_fwd_t, avx512_core) + CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t, avx512_core) CPU_INSTANCE_AVX512(jit_avx512_common_dw_convolution_fwd_t) + CPU_INSTANCE_AVX512(jit_avx512_common_fork_dw_convolution_fwd_t) CPU_INSTANCE_AVX512(jit_avx512_common_1x1_convolution_fwd_f32_t) - CPU_INSTANCE_AVX512(jit_avx512_common_convolution_fwd_t) + CPU_INSTANCE_AVX512(jit_avx512_common_convolution_fwd_t, f32) + CPU_INSTANCE_AVX2(jit_avx2_planar_convolution_fwd_t) CPU_INSTANCE_AVX2(jit_avx2_dw_convolution_fwd_t) - CPU_INSTANCE_AVX2(brgemm_1x1_convolution_fwd_t) - CPU_INSTANCE_AVX2(brgemm_convolution_fwd_t) + CPU_INSTANCE_AVX2(brgemm_1x1_convolution_fwd_t, avx2) + 
CPU_INSTANCE_AVX2(brgemm_convolution_fwd_t, avx2) + CPU_INSTANCE_AVX2(jit_avx2_fork_dw_convolution_fwd_t) + CPU_INSTANCE_AVX2(jit_avx2_1x1_convolution_with_dw_conv_fwd_t) CPU_INSTANCE_AVX2(jit_avx2_1x1_convolution_fwd_t) CPU_INSTANCE_SSE41(jit_sse41_dw_convolution_fwd_t) + CPU_INSTANCE_SSE41(jit_sse41_fork_dw_convolution_fwd_t) CPU_INSTANCE_SSE41(jit_sse41_1x1_convolution_fwd_t) CPU_INSTANCE_AVX2(jit_avx2_convolution_fwd_t) CPU_INSTANCE_SSE41(jit_sse41_convolution_fwd_t) - CPU_INSTANCE_AARCH64_ACL(acl_wino_convolution_fwd_t) - CPU_INSTANCE_AARCH64(brdgmm_dw_convolution_fwd_t) - CPU_INSTANCE_AARCH64(brgemm_1x1_convolution_fwd_t) - CPU_INSTANCE_AARCH64(brgemm_convolution_fwd_t) - CPU_INSTANCE_AARCH64(jit_uni_dw_convolution_fwd_t) - CPU_INSTANCE_AARCH64(jit_sve_512_1x1_convolution_fwd_f32_t) - CPU_INSTANCE_AARCH64(jit_sve_convolution_fwd_t) - CPU_INSTANCE_AARCH64(jit_uni_dw_convolution_fwd_t) - CPU_INSTANCE_AARCH64_ACL(acl_depthwise_convolution_fwd_t) - CPU_INSTANCE_AARCH64_ACL(acl_indirect_gemm_convolution_fwd_t) - CPU_INSTANCE_AARCH64_ACL(acl_gemm_convolution_fwd_t) - CPU_INSTANCE_AARCH64(brdgmm_dw_convolution_fwd_t) - CPU_INSTANCE_AARCH64(brgemm_1x1_convolution_fwd_t) - CPU_INSTANCE_AARCH64(brgemm_convolution_fwd_t) - CPU_INSTANCE_AARCH64(jit_sve_convolution_fwd_t) + CPU_INSTANCE_ACL(acl_wino_convolution_fwd_t) + CPU_INSTANCE_AARCH64(brdgmm_dw_convolution_fwd_t,sve_512) + CPU_INSTANCE_AARCH64(brgemm_1x1_convolution_fwd_t,sve_512) + CPU_INSTANCE_AARCH64(brgemm_convolution_fwd_t,sve_512) + CPU_INSTANCE_AARCH64(jit_uni_dw_convolution_fwd_t,sve_512,data_type::f32) + CPU_INSTANCE_AARCH64(jit_sve_1x1_convolution_fwd_t,f32,f32,f32,sve_512) + CPU_INSTANCE_AARCH64(jit_sve_convolution_fwd_t,f32,f32,f32,sve_512) + CPU_INSTANCE_AARCH64(jit_uni_dw_convolution_fwd_t,sve_256,data_type::f32) + CPU_INSTANCE_AARCH64(jit_sve_1x1_convolution_fwd_t,f32,f32,f32,sve_256) + CPU_INSTANCE_AARCH64(jit_sve_convolution_fwd_t,f32,f32,f32,sve_256) + CPU_INSTANCE_ACL(acl_depthwise_convolution_fwd_t) + CPU_INSTANCE_ACL(acl_indirect_gemm_convolution_fwd_t) + CPU_INSTANCE_ACL(acl_gemm_convolution_fwd_t,f32) + CPU_INSTANCE_AARCH64(brdgmm_dw_convolution_fwd_t,sve_256) + CPU_INSTANCE_AARCH64(brgemm_1x1_convolution_fwd_t,sve_256) + CPU_INSTANCE_AARCH64(brgemm_convolution_fwd_t,sve_256) + // CPU_INSTANCE_X64(jit_uni_ncsp_convolution_fwd_t) CPU_INSTANCE(gemm_convolution_fwd_t) CPU_INSTANCE(ref_convolution_fwd_t) CPU_INSTANCE(ref_fused_convolution_fwd_t) nullptr, }}, + {{forward, f32, f16, f32}, { + CPU_INSTANCE_AVX512(brdgmm_dw_convolution_fwd_t) + CPU_INSTANCE_AVX512(brgemm_1x1_convolution_fwd_t, avx512_core) + CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t, avx512_core) + CPU_INSTANCE_AVX2(brgemm_1x1_convolution_fwd_t, avx2) + CPU_INSTANCE_AVX2(brgemm_convolution_fwd_t, avx2) + CPU_INSTANCE(ref_convolution_fwd_t) + nullptr, + }}, + {{forward, f32, bf16, f32}, { + CPU_INSTANCE_AVX512(brdgmm_dw_convolution_fwd_t) + CPU_INSTANCE_AVX512(brgemm_1x1_convolution_fwd_t, avx512_core) + CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t, avx512_core) + CPU_INSTANCE_AVX2(brgemm_1x1_convolution_fwd_t, avx2) + CPU_INSTANCE_AVX2(brgemm_convolution_fwd_t, avx2) + CPU_INSTANCE(ref_convolution_fwd_t) + nullptr, + }}, {{forward, bf16, bf16, f32}, { CPU_INSTANCE_AVX512(brdgmm_dw_convolution_fwd_t) CPU_INSTANCE_X64(ip_convolution_fwd_t) - CPU_INSTANCE_AMX(brgemm_1x1_convolution_fwd_t) - CPU_INSTANCE_AMX(brgemm_convolution_fwd_t) + CPU_INSTANCE_AMX(brgemm_1x1_convolution_fwd_t, avx512_core_amx) + CPU_INSTANCE_AMX(brgemm_convolution_fwd_t, avx512_core_amx) 
CPU_INSTANCE_AMX(jit_avx512_core_amx_1x1_convolution_fwd_t) CPU_INSTANCE_AMX(jit_avx512_core_amx_convolution_fwd_t) - CPU_INSTANCE_AVX512(brgemm_1x1_convolution_fwd_t) - CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t) - CPU_INSTANCE_AVX512(jit_uni_dw_convolution_fwd_t) - CPU_INSTANCE_AVX512(jit_avx512_core_bf16_1x1_convolution_fwd_t) + CPU_INSTANCE_AVX512(brgemm_1x1_convolution_fwd_t, avx512_core_bf16) + CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t, avx512_core_bf16) + CPU_INSTANCE_AVX512(jit_uni_dw_convolution_fwd_t, avx512_core, bf16, f32) + CPU_INSTANCE_AVX512(jit_uni_fork_dw_convolution_fwd_t, avx512_core, bf16, f32) + CPU_INSTANCE_AVX512(jit_avx512_core_bf16_1x1_convolution_fwd_t, f32) CPU_INSTANCE_AVX512(jit_avx512_core_bf16_convolution_fwd_t) - CPU_INSTANCE_AVX512(gemm_bf16_convolution_fwd_t) - CPU_INSTANCE_AVX2(brgemm_1x1_convolution_fwd_t) - CPU_INSTANCE_AVX2(brgemm_convolution_fwd_t) + // CPU_INSTANCE_X64(jit_uni_ncsp_convolution_fwd_t) + CPU_INSTANCE_AVX512(gemm_bf16_convolution_fwd_t,f32) + CPU_INSTANCE_AVX2(brgemm_1x1_convolution_fwd_t,avx2_vnni_2) + CPU_INSTANCE_AVX2(brgemm_convolution_fwd_t,avx2_vnni_2) CPU_INSTANCE(ref_convolution_fwd_t) nullptr, }}, {{forward, bf16, bf16, bf16}, { CPU_INSTANCE_AVX512(brdgmm_dw_convolution_fwd_t) CPU_INSTANCE_X64(ip_convolution_fwd_t) - CPU_INSTANCE_AMX(brgemm_1x1_convolution_fwd_t) - CPU_INSTANCE_AMX(brgemm_convolution_fwd_t) + CPU_INSTANCE_AMX(brgemm_1x1_convolution_fwd_t, avx512_core_amx) + CPU_INSTANCE_AMX(brgemm_convolution_fwd_t, avx512_core_amx) CPU_INSTANCE_AMX(jit_avx512_core_amx_1x1_convolution_fwd_t) CPU_INSTANCE_AMX(jit_avx512_core_amx_convolution_fwd_t) - CPU_INSTANCE_AVX512(brgemm_1x1_convolution_fwd_t) - CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t) - CPU_INSTANCE_AVX512(jit_uni_dw_convolution_fwd_t) - CPU_INSTANCE_AVX512(jit_avx512_core_bf16_1x1_convolution_fwd_t) + CPU_INSTANCE_AVX512(brgemm_1x1_convolution_fwd_t, avx512_core_bf16) + CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t, avx512_core_bf16) + CPU_INSTANCE_AVX512(jit_uni_dw_convolution_fwd_t, avx512_core, bf16, bf16) + CPU_INSTANCE_AVX512(jit_uni_fork_dw_convolution_fwd_t, avx512_core, bf16, bf16) + CPU_INSTANCE_AVX512(jit_avx512_core_bf16_1x1_convolution_fwd_t, bf16) CPU_INSTANCE_AVX512(jit_avx512_core_bf16_convolution_fwd_t) - CPU_INSTANCE_AVX512(gemm_bf16_convolution_fwd_t) - CPU_INSTANCE_AVX2(brgemm_1x1_convolution_fwd_t) - CPU_INSTANCE_AVX2(brgemm_convolution_fwd_t) - CPU_INSTANCE_AARCH64_ACL(acl_indirect_gemm_convolution_fwd_t) + // CPU_INSTANCE_X64(jit_uni_ncsp_convolution_fwd_t) + CPU_INSTANCE_AVX512(gemm_bf16_convolution_fwd_t,bf16) + CPU_INSTANCE_AVX2(brgemm_1x1_convolution_fwd_t,avx2_vnni_2) + CPU_INSTANCE_AVX2(brgemm_convolution_fwd_t,avx2_vnni_2) + CPU_INSTANCE_ACL(acl_indirect_gemm_convolution_fwd_t) CPU_INSTANCE(ref_convolution_fwd_t) CPU_INSTANCE(ref_fused_convolution_fwd_t) nullptr, @@ -197,28 +233,30 @@ const std::map> &impl_list_map() {{forward, f16, f16, f32}, { CPU_INSTANCE_AVX512(brdgmm_dw_convolution_fwd_t) CPU_INSTANCE_X64(ip_convolution_fwd_t) - CPU_INSTANCE_AMX(brgemm_1x1_convolution_fwd_t) - CPU_INSTANCE_AMX(brgemm_convolution_fwd_t) - CPU_INSTANCE_AVX512(brgemm_1x1_convolution_fwd_t) - CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t) - CPU_INSTANCE_AVX2(brgemm_1x1_convolution_fwd_t) - CPU_INSTANCE_AVX2(brgemm_convolution_fwd_t) + CPU_INSTANCE_AVX512(jit_uni_dw_convolution_fwd_t,avx512_core_fp16, f16, f32) + CPU_INSTANCE_AMX(brgemm_1x1_convolution_fwd_t,avx512_core_amx_fp16) + CPU_INSTANCE_AMX(brgemm_convolution_fwd_t,avx512_core_amx_fp16) + 
CPU_INSTANCE_AVX512(brgemm_1x1_convolution_fwd_t,avx512_core_fp16) + CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t,avx512_core_fp16) + CPU_INSTANCE_AVX2(brgemm_1x1_convolution_fwd_t,avx2_vnni_2) + CPU_INSTANCE_AVX2(brgemm_convolution_fwd_t,avx2_vnni_2) CPU_INSTANCE(ref_convolution_fwd_t) nullptr, }}, {{forward, f16, f16, f16}, { CPU_INSTANCE_AVX512(brdgmm_dw_convolution_fwd_t) CPU_INSTANCE_X64(ip_convolution_fwd_t) - CPU_INSTANCE_AMX(brgemm_1x1_convolution_fwd_t) - CPU_INSTANCE_AMX(brgemm_convolution_fwd_t) - CPU_INSTANCE_AVX512(brgemm_1x1_convolution_fwd_t) - CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t) - CPU_INSTANCE_AVX2(brgemm_1x1_convolution_fwd_t) - CPU_INSTANCE_AVX2(brgemm_convolution_fwd_t) - CPU_INSTANCE_AARCH64_ACL(acl_wino_convolution_fwd_t) - CPU_INSTANCE_AARCH64_ACL(acl_depthwise_convolution_fwd_t) - CPU_INSTANCE_AARCH64_ACL(acl_indirect_gemm_convolution_fwd_t) - CPU_INSTANCE_AARCH64_ACL(acl_gemm_convolution_fwd_t) + CPU_INSTANCE_AVX512(jit_uni_dw_convolution_fwd_t,avx512_core_fp16, f16, f16) + CPU_INSTANCE_AMX(brgemm_1x1_convolution_fwd_t,avx512_core_amx_fp16) + CPU_INSTANCE_AMX(brgemm_convolution_fwd_t,avx512_core_amx_fp16) + CPU_INSTANCE_AVX512(brgemm_1x1_convolution_fwd_t,avx512_core_fp16) + CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t,avx512_core_fp16) + CPU_INSTANCE_AVX2(brgemm_1x1_convolution_fwd_t,avx2_vnni_2) + CPU_INSTANCE_AVX2(brgemm_convolution_fwd_t,avx2_vnni_2) + CPU_INSTANCE_ACL(acl_wino_convolution_fwd_t) + CPU_INSTANCE_ACL(acl_depthwise_convolution_fwd_t) + CPU_INSTANCE_ACL(acl_indirect_gemm_convolution_fwd_t) + CPU_INSTANCE_ACL(acl_gemm_convolution_fwd_t, f16) CPU_INSTANCE(ref_convolution_fwd_t) CPU_INSTANCE(ref_fused_convolution_fwd_t) nullptr, @@ -242,80 +280,106 @@ const std::map> &impl_list_map() // BWD_D fp {{backward_data, f32, f32, f32}, REG_BWD_D_PK({ CPU_INSTANCE_X64(ip_convolution_bwd_data_t) - CPU_INSTANCE_AMX(brgemm_convolution_bwd_t) - CPU_INSTANCE_AVX512(brgemm_convolution_bwd_t) - CPU_INSTANCE_AVX512(brgemm_convolution_bwd_strided_t) + CPU_INSTANCE_AMX(brgemm_convolution_bwd_t, avx512_core_amx) + CPU_INSTANCE_AVX512(brgemm_convolution_bwd_t, avx512_core) + CPU_INSTANCE_AVX512(brgemm_convolution_bwd_strided_t, avx512_core) CPU_INSTANCE_AVX512(jit_avx512_common_dw_convolution_bwd_data_t) + CPU_INSTANCE_AVX512(jit_avx512_common_fork_dw_convolution_bwd_data_t) CPU_INSTANCE_AVX512(jit_avx512_common_1x1_convolution_bwd_data_f32_t) - CPU_INSTANCE_AVX512(jit_avx512_common_convolution_bwd_data_t) - CPU_INSTANCE_AVX2(brgemm_convolution_bwd_t) - CPU_INSTANCE_AVX2(brgemm_convolution_bwd_strided_t) + CPU_INSTANCE_AVX512(jit_avx512_common_convolution_bwd_data_t, f32) + CPU_INSTANCE_AVX2(brgemm_convolution_bwd_t, avx2) + CPU_INSTANCE_AVX2(brgemm_convolution_bwd_strided_t, avx2) CPU_INSTANCE_AVX2(jit_avx2_dw_convolution_bwd_data_t) + CPU_INSTANCE_AVX2(jit_avx2_fork_dw_convolution_bwd_data_t) CPU_INSTANCE_AVX2(jit_avx2_1x1_convolution_bwd_data_t) - CPU_INSTANCE_SSE41(jit_sse41_dw_convolution_bwd_data_t) + CPU_INSTANCE_SSE41(jit_sse41_fork_dw_convolution_bwd_data_t) CPU_INSTANCE_AVX2(jit_avx2_convolution_bwd_data_t) - CPU_INSTANCE_AARCH64(jit_uni_dw_convolution_bwd_data_t) - CPU_INSTANCE_AARCH64(jit_sve_512_1x1_convolution_bwd_data_f32_t) - CPU_INSTANCE_AARCH64(jit_sve_convolution_bwd_data_t) - CPU_INSTANCE_AARCH64(jit_uni_dw_convolution_bwd_data_t) - CPU_INSTANCE_AARCH64(jit_sve_convolution_bwd_data_t) + CPU_INSTANCE_AARCH64(brgemm_convolution_bwd_t,sve_512) + CPU_INSTANCE_AARCH64(brgemm_convolution_bwd_t,sve_256) + 
CPU_INSTANCE_AARCH64(jit_uni_dw_convolution_bwd_data_t,sve_512,data_type::f32) + CPU_INSTANCE_AARCH64(jit_sve_1x1_convolution_bwd_data_t,f32,f32,f32,sve_512) + CPU_INSTANCE_AARCH64(jit_sve_convolution_bwd_data_t,f32,f32,f32,sve_512) + CPU_INSTANCE_AARCH64(jit_uni_dw_convolution_bwd_data_t,sve_256,data_type::f32) + CPU_INSTANCE_AARCH64(jit_sve_1x1_convolution_bwd_data_t,f32,f32,f32,sve_256) + CPU_INSTANCE_AARCH64(jit_sve_convolution_bwd_data_t,f32,f32,f32,sve_256) + // CPU_INSTANCE_X64(jit_uni_ncsp_convolution_bwd_data_t) CPU_INSTANCE(gemm_convolution_bwd_data_t) CPU_INSTANCE(ref_convolution_bwd_data_t) nullptr, })}, + {{backward_data, f32, bf16, f32}, REG_BWD_D_PK({ + CPU_INSTANCE_AVX512(brgemm_convolution_bwd_t, avx512_core) + CPU_INSTANCE_AVX512(brgemm_convolution_bwd_strided_t, avx512_core) + CPU_INSTANCE_AVX2(brgemm_convolution_bwd_t, avx2) + CPU_INSTANCE_AVX512(brgemm_convolution_bwd_strided_t, avx2) + CPU_INSTANCE(ref_convolution_bwd_data_t) + nullptr, + })}, + {{backward_data, f32, f16, f32}, REG_BWD_D_PK({ + CPU_INSTANCE_AVX512(brgemm_convolution_bwd_t, avx512_core) + CPU_INSTANCE_AVX512(brgemm_convolution_bwd_strided_t, avx512_core) + CPU_INSTANCE_AVX2(brgemm_convolution_bwd_t, avx2) + CPU_INSTANCE_AVX512(brgemm_convolution_bwd_strided_t, avx2) + CPU_INSTANCE(ref_convolution_bwd_data_t) + nullptr, + })}, {{backward_data, f32, bf16, bf16}, REG_BWD_D_PK({ CPU_INSTANCE_X64(ip_convolution_bwd_data_t) - CPU_INSTANCE_AMX(brgemm_convolution_bwd_t) - CPU_INSTANCE_AMX(brgemm_convolution_bwd_strided_t) + CPU_INSTANCE_AMX(brgemm_convolution_bwd_t, avx512_core_amx) + CPU_INSTANCE_AMX(brgemm_convolution_bwd_strided_t, avx512_core_amx) CPU_INSTANCE_AMX(jit_avx512_core_amx_convolution_bwd_data_t) - CPU_INSTANCE_AVX512(brgemm_convolution_bwd_t) - CPU_INSTANCE_AVX512(brgemm_convolution_bwd_strided_t) - CPU_INSTANCE_AVX512(jit_uni_dw_convolution_bwd_data_t) - CPU_INSTANCE_AVX512(jit_avx512_core_bf16_1x1_convolution_bwd_data_t) + CPU_INSTANCE_AVX512(brgemm_convolution_bwd_t, avx512_core_bf16) + CPU_INSTANCE_AVX512(brgemm_convolution_bwd_strided_t, avx512_core_bf16) + CPU_INSTANCE_AVX512(jit_uni_dw_convolution_bwd_data_t, avx512_core, bf16, f32) + CPU_INSTANCE_AVX512(jit_uni_fork_dw_convolution_bwd_data_t, avx512_core, bf16, f32) + CPU_INSTANCE_AVX512(jit_avx512_core_bf16_1x1_convolution_bwd_data_t, f32) CPU_INSTANCE_AVX512(jit_avx512_core_bf16_convolution_bwd_data_t) - CPU_INSTANCE_AVX512(gemm_bf16_convolution_bwd_data_t) - CPU_INSTANCE_AVX2(brgemm_convolution_bwd_t) - CPU_INSTANCE_AVX2(brgemm_convolution_bwd_strided_t) + // CPU_INSTANCE_X64(jit_uni_ncsp_convolution_bwd_data_t) + CPU_INSTANCE_AVX512(gemm_bf16_convolution_bwd_data_t,f32) + CPU_INSTANCE_AVX2(brgemm_convolution_bwd_t, avx2_vnni_2) + CPU_INSTANCE_AVX2(brgemm_convolution_bwd_strided_t,avx2_vnni_2) CPU_INSTANCE(ref_convolution_bwd_data_t) nullptr, })}, {{backward_data, bf16, bf16, bf16}, REG_BWD_D_PK({ CPU_INSTANCE_X64(ip_convolution_bwd_data_t) - CPU_INSTANCE_AMX(brgemm_convolution_bwd_t) - CPU_INSTANCE_AMX(brgemm_convolution_bwd_strided_t) + CPU_INSTANCE_AMX(brgemm_convolution_bwd_t, avx512_core_amx) + CPU_INSTANCE_AMX(brgemm_convolution_bwd_strided_t, avx512_core_amx) CPU_INSTANCE_AMX(jit_avx512_core_amx_convolution_bwd_data_t) - CPU_INSTANCE_AVX512(brgemm_convolution_bwd_t) - CPU_INSTANCE_AVX512(brgemm_convolution_bwd_strided_t) - CPU_INSTANCE_AVX512(jit_uni_dw_convolution_bwd_data_t) - CPU_INSTANCE_AVX512(jit_avx512_core_bf16_1x1_convolution_bwd_data_t) + CPU_INSTANCE_AVX512(brgemm_convolution_bwd_t, avx512_core_bf16) + 
CPU_INSTANCE_AVX512(brgemm_convolution_bwd_strided_t, avx512_core_bf16) + CPU_INSTANCE_AVX512(jit_uni_dw_convolution_bwd_data_t, avx512_core, bf16, bf16) + CPU_INSTANCE_AVX512(jit_uni_fork_dw_convolution_bwd_data_t, avx512_core, bf16, bf16) + CPU_INSTANCE_AVX512(jit_avx512_core_bf16_1x1_convolution_bwd_data_t, bf16) CPU_INSTANCE_AVX512(jit_avx512_core_bf16_convolution_bwd_data_t) - CPU_INSTANCE_AVX512(gemm_bf16_convolution_bwd_data_t) - CPU_INSTANCE_AVX2(brgemm_convolution_bwd_t) - CPU_INSTANCE_AVX2(brgemm_convolution_bwd_strided_t) + // CPU_INSTANCE_X64(jit_uni_ncsp_convolution_bwd_data_t) + CPU_INSTANCE_AVX512(gemm_bf16_convolution_bwd_data_t,bf16) + CPU_INSTANCE_AVX2(brgemm_convolution_bwd_t,avx2_vnni_2) + CPU_INSTANCE_AVX2(brgemm_convolution_bwd_strided_t,avx2_vnni_2) CPU_INSTANCE(ref_convolution_bwd_data_t) nullptr, })}, {{backward_data, f32, f16, f16}, REG_BWD_D_PK({ CPU_INSTANCE_X64(ip_convolution_bwd_data_t) - CPU_INSTANCE_AMX(brgemm_convolution_bwd_t) - CPU_INSTANCE_AMX(brgemm_convolution_bwd_strided_t) + CPU_INSTANCE_AMX(brgemm_convolution_bwd_t, avx512_core_amx_fp16) + CPU_INSTANCE_AMX(brgemm_convolution_bwd_strided_t, avx512_core_amx_fp16) CPU_INSTANCE_AMX(jit_avx512_core_amx_convolution_bwd_data_t) - CPU_INSTANCE_AVX512(brgemm_convolution_bwd_t) - CPU_INSTANCE_AVX512(brgemm_convolution_bwd_strided_t) - CPU_INSTANCE_AVX2(brgemm_convolution_bwd_t) - CPU_INSTANCE_AVX2(brgemm_convolution_bwd_strided_t) + CPU_INSTANCE_AVX512(brgemm_convolution_bwd_t, avx512_core_fp16) + CPU_INSTANCE_AVX512(brgemm_convolution_bwd_strided_t, avx512_core_fp16) + CPU_INSTANCE_AVX2(brgemm_convolution_bwd_t, avx2_vnni_2) + CPU_INSTANCE_AVX2(brgemm_convolution_bwd_strided_t, avx2_vnni_2) CPU_INSTANCE(ref_convolution_bwd_data_t) nullptr, })}, {{backward_data, f16, f16, f16}, REG_BWD_D_PK({ CPU_INSTANCE_X64(ip_convolution_bwd_data_t) - CPU_INSTANCE_AMX(brgemm_convolution_bwd_t) - CPU_INSTANCE_AMX(brgemm_convolution_bwd_strided_t) + CPU_INSTANCE_AMX(brgemm_convolution_bwd_t, avx512_core_amx_fp16) + CPU_INSTANCE_AMX(brgemm_convolution_bwd_strided_t, avx512_core_amx_fp16) CPU_INSTANCE_AMX(jit_avx512_core_amx_convolution_bwd_data_t) - CPU_INSTANCE_AVX512(brgemm_convolution_bwd_t) - CPU_INSTANCE_AVX512(brgemm_convolution_bwd_strided_t) - CPU_INSTANCE_AVX2(brgemm_convolution_bwd_t) - CPU_INSTANCE_AVX2(brgemm_convolution_bwd_strided_t) + CPU_INSTANCE_AVX512(brgemm_convolution_bwd_t, avx512_core_fp16) + CPU_INSTANCE_AVX512(brgemm_convolution_bwd_strided_t, avx512_core_fp16) + CPU_INSTANCE_AVX2(brgemm_convolution_bwd_t, avx2_vnni_2) + CPU_INSTANCE_AVX2(brgemm_convolution_bwd_strided_t, avx2_vnni_2) CPU_INSTANCE(ref_convolution_bwd_data_t) nullptr, })}, @@ -340,39 +404,43 @@ const std::map> &impl_list_map() CPU_INSTANCE_X64(ip_convolution_bwd_weights_t) CPU_INSTANCE_AVX512(jit_avx512_common_dw_convolution_bwd_weights_t) CPU_INSTANCE_AVX512(jit_avx512_common_1x1_convolution_bwd_weights_t) - CPU_INSTANCE_AVX512(jit_avx512_common_convolution_bwd_weights_t) + CPU_INSTANCE_AVX512(jit_avx512_common_convolution_bwd_weights_t, f32) CPU_INSTANCE_AVX2(jit_avx2_dw_convolution_bwd_weights_t) CPU_INSTANCE_AVX2(jit_avx2_1x1_convolution_bwd_weights_t) CPU_INSTANCE_SSE41(jit_sse41_dw_convolution_bwd_weights_t) CPU_INSTANCE_AVX2(jit_avx2_convolution_bwd_weights_t) - CPU_INSTANCE_AARCH64(jit_uni_dw_convolution_bwd_weights_t) - CPU_INSTANCE_AARCH64(jit_sve_512_1x1_convolution_bwd_weights_t) - CPU_INSTANCE_AARCH64(jit_sve_convolution_bwd_weights_t) - CPU_INSTANCE_AARCH64(jit_uni_dw_convolution_bwd_weights_t) - 
CPU_INSTANCE_AARCH64(jit_sve_convolution_bwd_weights_t) + CPU_INSTANCE_AARCH64(jit_uni_dw_convolution_bwd_weights_t,sve_512,data_type::f32) + CPU_INSTANCE_AARCH64(jit_sve_1x1_convolution_bwd_weights_t,f32,f32,f32,sve_512) + CPU_INSTANCE_AARCH64(jit_sve_convolution_bwd_weights_t,f32,f32,f32,sve_512) + CPU_INSTANCE_AARCH64(jit_uni_dw_convolution_bwd_weights_t,sve_256,data_type::f32) + CPU_INSTANCE_AARCH64(jit_sve_1x1_convolution_bwd_weights_t,f32,f32,f32,sve_256) + CPU_INSTANCE_AARCH64(jit_sve_convolution_bwd_weights_t,f32,f32,f32,sve_256) + // CPU_INSTANCE_X64(jit_uni_ncsp_convolution_bwd_weights_t) CPU_INSTANCE(gemm_convolution_bwd_weights_t) CPU_INSTANCE(ref_convolution_bwd_weights_t) nullptr, })}, {{backward_weights, bf16, f32, bf16}, REG_BWD_PK({ CPU_INSTANCE_X64(ip_convolution_bwd_weights_t) - CPU_INSTANCE_AVX512(jit_uni_dw_convolution_bwd_weights_t) + CPU_INSTANCE_AVX512(jit_uni_dw_convolution_bwd_weights_t, avx512_core, bf16, f32) CPU_INSTANCE_AMX(brgemm_convolution_bwd_weights_t) CPU_INSTANCE_AMX(jit_avx512_core_amx_convolution_bwd_weights_t) - CPU_INSTANCE_AVX512(jit_avx512_core_bf16_1x1_convolution_bwd_weights_t) + CPU_INSTANCE_AVX512(jit_avx512_core_bf16_1x1_convolution_bwd_weights_t, f32) CPU_INSTANCE_AVX512(jit_avx512_core_bf16_convolution_bwd_weights_t) - CPU_INSTANCE_AVX512(gemm_bf16_convolution_bwd_weights_t) + // CPU_INSTANCE_X64(jit_uni_ncsp_convolution_bwd_weights_t) + CPU_INSTANCE_AVX512(gemm_bf16_convolution_bwd_weights_t,f32) CPU_INSTANCE(ref_convolution_bwd_weights_t) nullptr, })}, {{backward_weights, bf16, bf16, bf16}, REG_BWD_PK({ CPU_INSTANCE_X64(ip_convolution_bwd_weights_t) - CPU_INSTANCE_AVX512(jit_uni_dw_convolution_bwd_weights_t) + CPU_INSTANCE_AVX512(jit_uni_dw_convolution_bwd_weights_t, avx512_core, bf16, bf16) CPU_INSTANCE_AMX(brgemm_convolution_bwd_weights_t) CPU_INSTANCE_AMX(jit_avx512_core_amx_convolution_bwd_weights_t) - CPU_INSTANCE_AVX512(jit_avx512_core_bf16_1x1_convolution_bwd_weights_t) + CPU_INSTANCE_AVX512(jit_avx512_core_bf16_1x1_convolution_bwd_weights_t, bf16) CPU_INSTANCE_AVX512(jit_avx512_core_bf16_convolution_bwd_weights_t) - CPU_INSTANCE_AVX512(gemm_bf16_convolution_bwd_weights_t) + // CPU_INSTANCE_X64(jit_uni_ncsp_convolution_bwd_weights_t) + CPU_INSTANCE_AVX512(gemm_bf16_convolution_bwd_weights_t,bf16) CPU_INSTANCE(ref_convolution_bwd_weights_t) nullptr, })}, @@ -408,25 +476,25 @@ const std::map> &impl_list_map() {{forward, s8, s8, f32}, { CPU_INSTANCE_AVX512(brdgmm_dw_convolution_fwd_t) CPU_INSTANCE_X64(ip_convolution_fwd_t) - CPU_INSTANCE_AMX(brgemm_1x1_convolution_fwd_t) - CPU_INSTANCE_AMX(brgemm_convolution_fwd_t) + CPU_INSTANCE_AMX(brgemm_1x1_convolution_fwd_t, avx512_core_amx) + CPU_INSTANCE_AMX(brgemm_convolution_fwd_t, avx512_core_amx) CPU_INSTANCE_AMX(jit_avx512_core_amx_1x1_convolution_fwd_t) CPU_INSTANCE_AMX(jit_avx512_core_amx_convolution_fwd_t) - CPU_INSTANCE_AVX512(brgemm_1x1_convolution_fwd_t) - CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t) - CPU_INSTANCE_AVX512(brgemm_1x1_convolution_fwd_t) - CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t) + CPU_INSTANCE_AVX512(brgemm_1x1_convolution_fwd_t, avx512_core_vnni) + CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t, avx512_core_vnni) + CPU_INSTANCE_AVX512(brgemm_1x1_convolution_fwd_t, avx512_core) + CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t, avx512_core) CPU_INSTANCE_AVX512(jit_avx512_core_x8s8s32x_1x1_convolution_fwd_t) CPU_INSTANCE_AVX512(jit_avx512_core_x8s8s32x_convolution_fwd_t) - CPU_INSTANCE_AVX2(brgemm_1x1_convolution_fwd_t) - CPU_INSTANCE_AVX2(brgemm_1x1_convolution_fwd_t) 
- CPU_INSTANCE_AVX2(brgemm_convolution_fwd_t) - CPU_INSTANCE_AVX2(brgemm_convolution_fwd_t) - CPU_INSTANCE_AVX2(jit_uni_x8s8s32x_1x1_convolution_fwd_t) - CPU_INSTANCE_AVX2(jit_uni_x8s8s32x_convolution_fwd_t) - CPU_INSTANCE_SSE41(jit_uni_x8s8s32x_1x1_convolution_fwd_t) - CPU_INSTANCE_SSE41(jit_uni_x8s8s32x_convolution_fwd_t) - CPU_INSTANCE_AARCH64(jit_sve_512_x8s8s32x_convolution_fwd_t) + CPU_INSTANCE_AVX2(brgemm_1x1_convolution_fwd_t, avx2_vnni_2) + CPU_INSTANCE_AVX2(brgemm_1x1_convolution_fwd_t, avx2_vnni) + CPU_INSTANCE_AVX2(brgemm_convolution_fwd_t, avx2_vnni_2) + CPU_INSTANCE_AVX2(brgemm_convolution_fwd_t, avx2_vnni) + CPU_INSTANCE_AVX2(jit_uni_x8s8s32x_1x1_convolution_fwd_t, avx2) + CPU_INSTANCE_AVX2(jit_uni_x8s8s32x_convolution_fwd_t, avx2) + CPU_INSTANCE_SSE41(jit_uni_x8s8s32x_1x1_convolution_fwd_t, sse41) + CPU_INSTANCE_SSE41(jit_uni_x8s8s32x_convolution_fwd_t, sse41) + CPU_INSTANCE_AARCH64(jit_sve_512_x8s8s32x_convolution_fwd_t, s8, f32) CPU_INSTANCE(gemm_x8s8s32x_convolution_fwd_t) CPU_INSTANCE(ref_convolution_int8_fwd_t) CPU_INSTANCE(ref_fused_convolution_fwd_t) @@ -436,14 +504,30 @@ const std::map> &impl_list_map() CPU_INSTANCE_AVX512(brdgmm_dw_convolution_fwd_t) CPU_INSTANCE_AMX(jit_avx512_core_amx_1x1_convolution_fwd_t) CPU_INSTANCE_AMX(jit_avx512_core_amx_convolution_fwd_t) - CPU_INSTANCE_AVX512(brgemm_1x1_convolution_fwd_t) - CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t) - CPU_INSTANCE_AVX512(brgemm_1x1_convolution_fwd_t) - CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t) + CPU_INSTANCE_AVX512(brgemm_1x1_convolution_fwd_t, avx512_core_vnni) + CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t, avx512_core_vnni) + CPU_INSTANCE_AVX512(brgemm_1x1_convolution_fwd_t, avx512_core) + CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t, avx512_core) + CPU_INSTANCE_AVX512(jit_avx512_core_x8s8s32x_1x1_convolution_fwd_t) + CPU_INSTANCE_AVX512(jit_avx512_core_x8s8s32x_convolution_fwd_t) + CPU_INSTANCE_AVX2(brgemm_1x1_convolution_fwd_t, avx2_vnni_2) + CPU_INSTANCE_AVX2(brgemm_convolution_fwd_t, avx2_vnni_2) + CPU_INSTANCE(gemm_x8s8s32x_convolution_fwd_t) + CPU_INSTANCE(ref_convolution_int8_fwd_t) + nullptr, + }}, + {{forward, s8, s8, f16}, { + CPU_INSTANCE_AVX512(brdgmm_dw_convolution_fwd_t) + CPU_INSTANCE_AMX(jit_avx512_core_amx_1x1_convolution_fwd_t) + CPU_INSTANCE_AMX(jit_avx512_core_amx_convolution_fwd_t) + CPU_INSTANCE_AVX512(brgemm_1x1_convolution_fwd_t, avx512_core_vnni) + CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t, avx512_core_vnni) + CPU_INSTANCE_AVX512(brgemm_1x1_convolution_fwd_t, avx512_core) + CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t, avx512_core) CPU_INSTANCE_AVX512(jit_avx512_core_x8s8s32x_1x1_convolution_fwd_t) CPU_INSTANCE_AVX512(jit_avx512_core_x8s8s32x_convolution_fwd_t) - CPU_INSTANCE_AVX2(brgemm_1x1_convolution_fwd_t) - CPU_INSTANCE_AVX2(brgemm_convolution_fwd_t) + CPU_INSTANCE_AVX2(brgemm_1x1_convolution_fwd_t, avx2_vnni_2) + CPU_INSTANCE_AVX2(brgemm_convolution_fwd_t, avx2_vnni_2) CPU_INSTANCE(gemm_x8s8s32x_convolution_fwd_t) CPU_INSTANCE(ref_convolution_int8_fwd_t) nullptr, @@ -451,25 +535,25 @@ const std::map> &impl_list_map() {{forward, s8, s8, s32}, { CPU_INSTANCE_AVX512(brdgmm_dw_convolution_fwd_t) CPU_INSTANCE_X64(ip_convolution_fwd_t) - CPU_INSTANCE_AMX(brgemm_1x1_convolution_fwd_t) - CPU_INSTANCE_AMX(brgemm_convolution_fwd_t) + CPU_INSTANCE_AMX(brgemm_1x1_convolution_fwd_t, avx512_core_amx) + CPU_INSTANCE_AMX(brgemm_convolution_fwd_t, avx512_core_amx) CPU_INSTANCE_AMX(jit_avx512_core_amx_1x1_convolution_fwd_t) CPU_INSTANCE_AMX(jit_avx512_core_amx_convolution_fwd_t) 
- CPU_INSTANCE_AVX512(brgemm_1x1_convolution_fwd_t) - CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t) - CPU_INSTANCE_AVX512(brgemm_1x1_convolution_fwd_t) - CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t) + CPU_INSTANCE_AVX512(brgemm_1x1_convolution_fwd_t, avx512_core_vnni) + CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t, avx512_core_vnni) + CPU_INSTANCE_AVX512(brgemm_1x1_convolution_fwd_t, avx512_core) + CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t, avx512_core) CPU_INSTANCE_AVX512(jit_avx512_core_x8s8s32x_1x1_convolution_fwd_t) CPU_INSTANCE_AVX512(jit_avx512_core_x8s8s32x_convolution_fwd_t) - CPU_INSTANCE_AVX2(brgemm_1x1_convolution_fwd_t) - CPU_INSTANCE_AVX2(brgemm_1x1_convolution_fwd_t) - CPU_INSTANCE_AVX2(brgemm_convolution_fwd_t) - CPU_INSTANCE_AVX2(brgemm_convolution_fwd_t) - CPU_INSTANCE_AVX2(jit_uni_x8s8s32x_1x1_convolution_fwd_t) - CPU_INSTANCE_AVX2(jit_uni_x8s8s32x_convolution_fwd_t) - CPU_INSTANCE_SSE41(jit_uni_x8s8s32x_1x1_convolution_fwd_t) - CPU_INSTANCE_SSE41(jit_uni_x8s8s32x_convolution_fwd_t) - CPU_INSTANCE_AARCH64(jit_sve_512_x8s8s32x_convolution_fwd_t) + CPU_INSTANCE_AVX2(brgemm_1x1_convolution_fwd_t, avx2_vnni_2) + CPU_INSTANCE_AVX2(brgemm_1x1_convolution_fwd_t, avx2_vnni) + CPU_INSTANCE_AVX2(brgemm_convolution_fwd_t, avx2_vnni_2) + CPU_INSTANCE_AVX2(brgemm_convolution_fwd_t, avx2_vnni) + CPU_INSTANCE_AVX2(jit_uni_x8s8s32x_1x1_convolution_fwd_t, avx2) + CPU_INSTANCE_AVX2(jit_uni_x8s8s32x_convolution_fwd_t, avx2) + CPU_INSTANCE_SSE41(jit_uni_x8s8s32x_1x1_convolution_fwd_t, sse41) + CPU_INSTANCE_SSE41(jit_uni_x8s8s32x_convolution_fwd_t, sse41) + CPU_INSTANCE_AARCH64(jit_sve_512_x8s8s32x_convolution_fwd_t, s8, s32) CPU_INSTANCE(gemm_x8s8s32x_convolution_fwd_t) CPU_INSTANCE(ref_convolution_int8_fwd_t) CPU_INSTANCE(ref_fused_convolution_fwd_t) @@ -478,26 +562,26 @@ const std::map> &impl_list_map() {{forward, s8, s8, s8}, { CPU_INSTANCE_AVX512(brdgmm_dw_convolution_fwd_t) CPU_INSTANCE_X64(ip_convolution_fwd_t) - CPU_INSTANCE_AMX(brgemm_1x1_convolution_fwd_t) - CPU_INSTANCE_AMX(brgemm_convolution_fwd_t) + CPU_INSTANCE_AMX(brgemm_1x1_convolution_fwd_t, avx512_core_amx) + CPU_INSTANCE_AMX(brgemm_convolution_fwd_t, avx512_core_amx) CPU_INSTANCE_AMX(jit_avx512_core_amx_1x1_convolution_fwd_t) CPU_INSTANCE_AMX(jit_avx512_core_amx_convolution_fwd_t) - CPU_INSTANCE_AVX512(brgemm_1x1_convolution_fwd_t) - CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t) - CPU_INSTANCE_AVX512(brgemm_1x1_convolution_fwd_t) - CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t) + CPU_INSTANCE_AVX512(brgemm_1x1_convolution_fwd_t, avx512_core_vnni) + CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t, avx512_core_vnni) + CPU_INSTANCE_AVX512(brgemm_1x1_convolution_fwd_t, avx512_core) + CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t, avx512_core) CPU_INSTANCE_AVX512(jit_avx512_core_x8s8s32x_1x1_convolution_fwd_t) CPU_INSTANCE_AVX512(jit_avx512_core_x8s8s32x_convolution_fwd_t) - CPU_INSTANCE_AVX2(brgemm_1x1_convolution_fwd_t) - CPU_INSTANCE_AVX2(brgemm_1x1_convolution_fwd_t) - CPU_INSTANCE_AVX2(brgemm_convolution_fwd_t) - CPU_INSTANCE_AVX2(brgemm_convolution_fwd_t) - CPU_INSTANCE_AVX2(jit_uni_x8s8s32x_1x1_convolution_fwd_t) - CPU_INSTANCE_AVX2(jit_uni_x8s8s32x_convolution_fwd_t) - CPU_INSTANCE_SSE41(jit_uni_x8s8s32x_1x1_convolution_fwd_t) - CPU_INSTANCE_SSE41(jit_uni_x8s8s32x_convolution_fwd_t) - CPU_INSTANCE_AARCH64(jit_sve_512_x8s8s32x_convolution_fwd_t) - CPU_INSTANCE_AARCH64_ACL(acl_gemm_convolution_fwd_t) + CPU_INSTANCE_AVX2(brgemm_1x1_convolution_fwd_t, avx2_vnni_2) + CPU_INSTANCE_AVX2(brgemm_1x1_convolution_fwd_t, avx2_vnni) + 
CPU_INSTANCE_AVX2(brgemm_convolution_fwd_t, avx2_vnni_2) + CPU_INSTANCE_AVX2(brgemm_convolution_fwd_t, avx2_vnni) + CPU_INSTANCE_AVX2(jit_uni_x8s8s32x_1x1_convolution_fwd_t, avx2) + CPU_INSTANCE_AVX2(jit_uni_x8s8s32x_convolution_fwd_t, avx2) + CPU_INSTANCE_SSE41(jit_uni_x8s8s32x_1x1_convolution_fwd_t, sse41) + CPU_INSTANCE_SSE41(jit_uni_x8s8s32x_convolution_fwd_t, sse41) + CPU_INSTANCE_AARCH64(jit_sve_512_x8s8s32x_convolution_fwd_t, s8, s8) + CPU_INSTANCE_ACL(acl_gemm_convolution_fwd_t, s8, s8, s8, s32) CPU_INSTANCE(gemm_x8s8s32x_convolution_fwd_t) CPU_INSTANCE(ref_convolution_int8_fwd_t) CPU_INSTANCE(ref_fused_convolution_fwd_t) @@ -506,25 +590,25 @@ const std::map> &impl_list_map() {{forward, s8, s8, u8}, { CPU_INSTANCE_AVX512(brdgmm_dw_convolution_fwd_t) CPU_INSTANCE_X64(ip_convolution_fwd_t) - CPU_INSTANCE_AMX(brgemm_1x1_convolution_fwd_t) - CPU_INSTANCE_AMX(brgemm_convolution_fwd_t) + CPU_INSTANCE_AMX(brgemm_1x1_convolution_fwd_t, avx512_core_amx) + CPU_INSTANCE_AMX(brgemm_convolution_fwd_t, avx512_core_amx) CPU_INSTANCE_AMX(jit_avx512_core_amx_1x1_convolution_fwd_t) CPU_INSTANCE_AMX(jit_avx512_core_amx_convolution_fwd_t) - CPU_INSTANCE_AVX512(brgemm_1x1_convolution_fwd_t) - CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t) - CPU_INSTANCE_AVX512(brgemm_1x1_convolution_fwd_t) - CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t) + CPU_INSTANCE_AVX512(brgemm_1x1_convolution_fwd_t, avx512_core_vnni) + CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t, avx512_core_vnni) + CPU_INSTANCE_AVX512(brgemm_1x1_convolution_fwd_t, avx512_core) + CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t, avx512_core) CPU_INSTANCE_AVX512(jit_avx512_core_x8s8s32x_1x1_convolution_fwd_t) CPU_INSTANCE_AVX512(jit_avx512_core_x8s8s32x_convolution_fwd_t) - CPU_INSTANCE_AVX2(brgemm_1x1_convolution_fwd_t) - CPU_INSTANCE_AVX2(brgemm_1x1_convolution_fwd_t) - CPU_INSTANCE_AVX2(brgemm_convolution_fwd_t) - CPU_INSTANCE_AVX2(brgemm_convolution_fwd_t) - CPU_INSTANCE_AVX2(jit_uni_x8s8s32x_1x1_convolution_fwd_t) - CPU_INSTANCE_AVX2(jit_uni_x8s8s32x_convolution_fwd_t) - CPU_INSTANCE_SSE41(jit_uni_x8s8s32x_1x1_convolution_fwd_t) - CPU_INSTANCE_SSE41(jit_uni_x8s8s32x_convolution_fwd_t) - CPU_INSTANCE_AARCH64(jit_sve_512_x8s8s32x_convolution_fwd_t) + CPU_INSTANCE_AVX2(brgemm_1x1_convolution_fwd_t, avx2_vnni_2) + CPU_INSTANCE_AVX2(brgemm_1x1_convolution_fwd_t, avx2_vnni) + CPU_INSTANCE_AVX2(brgemm_convolution_fwd_t, avx2_vnni_2) + CPU_INSTANCE_AVX2(brgemm_convolution_fwd_t, avx2_vnni) + CPU_INSTANCE_AVX2(jit_uni_x8s8s32x_1x1_convolution_fwd_t, avx2) + CPU_INSTANCE_AVX2(jit_uni_x8s8s32x_convolution_fwd_t, avx2) + CPU_INSTANCE_SSE41(jit_uni_x8s8s32x_1x1_convolution_fwd_t, sse41) + CPU_INSTANCE_SSE41(jit_uni_x8s8s32x_convolution_fwd_t, sse41) + CPU_INSTANCE_AARCH64(jit_sve_512_x8s8s32x_convolution_fwd_t, s8, u8) CPU_INSTANCE(gemm_x8s8s32x_convolution_fwd_t) CPU_INSTANCE(ref_convolution_int8_fwd_t) CPU_INSTANCE(ref_fused_convolution_fwd_t) @@ -534,43 +618,61 @@ const std::map> &impl_list_map() {{forward, u8, s8, f32}, { CPU_INSTANCE_AVX512(brdgmm_dw_convolution_fwd_t) CPU_INSTANCE_X64(ip_convolution_fwd_t) - CPU_INSTANCE_AMX(brgemm_1x1_convolution_fwd_t) - CPU_INSTANCE_AMX(brgemm_convolution_fwd_t) + CPU_INSTANCE_AMX(brgemm_1x1_convolution_fwd_t, avx512_core_amx) + CPU_INSTANCE_AMX(brgemm_convolution_fwd_t, avx512_core_amx) CPU_INSTANCE_AMX(jit_avx512_core_amx_1x1_convolution_fwd_t) CPU_INSTANCE_AMX(jit_avx512_core_amx_convolution_fwd_t) - CPU_INSTANCE_AVX512(brgemm_1x1_convolution_fwd_t) - CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t) - 
CPU_INSTANCE_AVX512(brgemm_1x1_convolution_fwd_t) - CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t) + CPU_INSTANCE_AVX512(brgemm_1x1_convolution_fwd_t, avx512_core_vnni) + CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t, avx512_core_vnni) + CPU_INSTANCE_AVX512(brgemm_1x1_convolution_fwd_t, avx512_core) + CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t, avx512_core) CPU_INSTANCE_AVX512(jit_avx512_core_x8s8s32x_1x1_convolution_fwd_t) CPU_INSTANCE_AVX512(jit_avx512_core_x8s8s32x_convolution_fwd_t) - CPU_INSTANCE_AVX2(brgemm_1x1_convolution_fwd_t) - CPU_INSTANCE_AVX2(brgemm_1x1_convolution_fwd_t) - CPU_INSTANCE_AVX2(brgemm_convolution_fwd_t) - CPU_INSTANCE_AVX2(brgemm_convolution_fwd_t) - CPU_INSTANCE_AVX2(jit_uni_x8s8s32x_1x1_convolution_fwd_t) - CPU_INSTANCE_AVX2(jit_uni_x8s8s32x_convolution_fwd_t) - CPU_INSTANCE_SSE41(jit_uni_x8s8s32x_1x1_convolution_fwd_t) - CPU_INSTANCE_SSE41(jit_uni_x8s8s32x_convolution_fwd_t) - CPU_INSTANCE_AARCH64(jit_sve_512_x8s8s32x_convolution_fwd_t) + CPU_INSTANCE_AVX2(brgemm_1x1_convolution_fwd_t, avx2_vnni_2) + CPU_INSTANCE_AVX2(brgemm_1x1_convolution_fwd_t, avx2_vnni) + CPU_INSTANCE_AVX2(brgemm_convolution_fwd_t, avx2_vnni_2) + CPU_INSTANCE_AVX2(brgemm_convolution_fwd_t, avx2_vnni) + CPU_INSTANCE_AVX2(jit_uni_x8s8s32x_1x1_convolution_fwd_t, avx2) + CPU_INSTANCE_AVX2(jit_uni_x8s8s32x_convolution_fwd_t, avx2) + CPU_INSTANCE_SSE41(jit_uni_x8s8s32x_1x1_convolution_fwd_t, sse41) + CPU_INSTANCE_SSE41(jit_uni_x8s8s32x_convolution_fwd_t, sse41) + CPU_INSTANCE_AARCH64(jit_sve_512_x8s8s32x_convolution_fwd_t, u8, f32) CPU_INSTANCE(gemm_x8s8s32x_convolution_fwd_t) CPU_INSTANCE(ref_convolution_int8_fwd_t) nullptr, }}, {{forward, u8, s8, bf16}, { CPU_INSTANCE_AVX512(brdgmm_dw_convolution_fwd_t) - CPU_INSTANCE_AMX(brgemm_1x1_convolution_fwd_t) - CPU_INSTANCE_AMX(brgemm_convolution_fwd_t) + CPU_INSTANCE_AMX(brgemm_1x1_convolution_fwd_t, avx512_core_amx) + CPU_INSTANCE_AMX(brgemm_convolution_fwd_t, avx512_core_amx) + CPU_INSTANCE_AMX(jit_avx512_core_amx_1x1_convolution_fwd_t) + CPU_INSTANCE_AMX(jit_avx512_core_amx_convolution_fwd_t) + CPU_INSTANCE_AVX512(brgemm_1x1_convolution_fwd_t, avx512_core_vnni) + CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t, avx512_core_vnni) + CPU_INSTANCE_AVX512(brgemm_1x1_convolution_fwd_t, avx512_core) + CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t, avx512_core) + CPU_INSTANCE_AVX512(jit_avx512_core_x8s8s32x_1x1_convolution_fwd_t) + CPU_INSTANCE_AVX512(jit_avx512_core_x8s8s32x_convolution_fwd_t) + CPU_INSTANCE_AVX2(brgemm_1x1_convolution_fwd_t, avx2_vnni_2) + CPU_INSTANCE_AVX2(brgemm_convolution_fwd_t, avx2_vnni_2) + CPU_INSTANCE(gemm_x8s8s32x_convolution_fwd_t) + CPU_INSTANCE(ref_convolution_int8_fwd_t) + nullptr, + }}, + {{forward, u8, s8, f16}, { + CPU_INSTANCE_AVX512(brdgmm_dw_convolution_fwd_t) + CPU_INSTANCE_AMX(brgemm_1x1_convolution_fwd_t, avx512_core_amx) + CPU_INSTANCE_AMX(brgemm_convolution_fwd_t, avx512_core_amx) CPU_INSTANCE_AMX(jit_avx512_core_amx_1x1_convolution_fwd_t) CPU_INSTANCE_AMX(jit_avx512_core_amx_convolution_fwd_t) - CPU_INSTANCE_AVX512(brgemm_1x1_convolution_fwd_t) - CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t) - CPU_INSTANCE_AVX512(brgemm_1x1_convolution_fwd_t) - CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t) + CPU_INSTANCE_AVX512(brgemm_1x1_convolution_fwd_t, avx512_core_vnni) + CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t, avx512_core_vnni) + CPU_INSTANCE_AVX512(brgemm_1x1_convolution_fwd_t, avx512_core) + CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t, avx512_core) CPU_INSTANCE_AVX512(jit_avx512_core_x8s8s32x_1x1_convolution_fwd_t) 
CPU_INSTANCE_AVX512(jit_avx512_core_x8s8s32x_convolution_fwd_t) - CPU_INSTANCE_AVX2(brgemm_1x1_convolution_fwd_t) - CPU_INSTANCE_AVX2(brgemm_convolution_fwd_t) + CPU_INSTANCE_AVX2(brgemm_1x1_convolution_fwd_t, avx2_vnni_2) + CPU_INSTANCE_AVX2(brgemm_convolution_fwd_t, avx2_vnni_2) CPU_INSTANCE(gemm_x8s8s32x_convolution_fwd_t) CPU_INSTANCE(ref_convolution_int8_fwd_t) nullptr, @@ -578,25 +680,25 @@ const std::map> &impl_list_map() {{forward, u8, s8, s32}, { CPU_INSTANCE_AVX512(brdgmm_dw_convolution_fwd_t) CPU_INSTANCE_X64(ip_convolution_fwd_t) - CPU_INSTANCE_AMX(brgemm_1x1_convolution_fwd_t) - CPU_INSTANCE_AMX(brgemm_convolution_fwd_t) + CPU_INSTANCE_AMX(brgemm_1x1_convolution_fwd_t, avx512_core_amx) + CPU_INSTANCE_AMX(brgemm_convolution_fwd_t, avx512_core_amx) CPU_INSTANCE_AMX(jit_avx512_core_amx_1x1_convolution_fwd_t) CPU_INSTANCE_AMX(jit_avx512_core_amx_convolution_fwd_t) - CPU_INSTANCE_AVX512(brgemm_1x1_convolution_fwd_t) - CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t) - CPU_INSTANCE_AVX512(brgemm_1x1_convolution_fwd_t) - CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t) + CPU_INSTANCE_AVX512(brgemm_1x1_convolution_fwd_t, avx512_core_vnni) + CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t, avx512_core_vnni) + CPU_INSTANCE_AVX512(brgemm_1x1_convolution_fwd_t, avx512_core) + CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t, avx512_core) CPU_INSTANCE_AVX512(jit_avx512_core_x8s8s32x_1x1_convolution_fwd_t) CPU_INSTANCE_AVX512(jit_avx512_core_x8s8s32x_convolution_fwd_t) - CPU_INSTANCE_AVX2(brgemm_1x1_convolution_fwd_t) - CPU_INSTANCE_AVX2(brgemm_1x1_convolution_fwd_t) - CPU_INSTANCE_AVX2(brgemm_convolution_fwd_t) - CPU_INSTANCE_AVX2(brgemm_convolution_fwd_t) - CPU_INSTANCE_AVX2(jit_uni_x8s8s32x_1x1_convolution_fwd_t) - CPU_INSTANCE_AVX2(jit_uni_x8s8s32x_convolution_fwd_t) - CPU_INSTANCE_SSE41(jit_uni_x8s8s32x_1x1_convolution_fwd_t) - CPU_INSTANCE_SSE41(jit_uni_x8s8s32x_convolution_fwd_t) - CPU_INSTANCE_AARCH64(jit_sve_512_x8s8s32x_convolution_fwd_t) + CPU_INSTANCE_AVX2(brgemm_1x1_convolution_fwd_t, avx2_vnni_2) + CPU_INSTANCE_AVX2(brgemm_1x1_convolution_fwd_t, avx2_vnni) + CPU_INSTANCE_AVX2(brgemm_convolution_fwd_t, avx2_vnni_2) + CPU_INSTANCE_AVX2(brgemm_convolution_fwd_t, avx2_vnni) + CPU_INSTANCE_AVX2(jit_uni_x8s8s32x_1x1_convolution_fwd_t, avx2) + CPU_INSTANCE_AVX2(jit_uni_x8s8s32x_convolution_fwd_t, avx2) + CPU_INSTANCE_SSE41(jit_uni_x8s8s32x_1x1_convolution_fwd_t, sse41) + CPU_INSTANCE_SSE41(jit_uni_x8s8s32x_convolution_fwd_t, sse41) + CPU_INSTANCE_AARCH64(jit_sve_512_x8s8s32x_convolution_fwd_t, u8, s32) CPU_INSTANCE(gemm_x8s8s32x_convolution_fwd_t) CPU_INSTANCE(ref_convolution_int8_fwd_t) nullptr, @@ -604,25 +706,25 @@ const std::map> &impl_list_map() {{forward, u8, s8, s8}, { CPU_INSTANCE_AVX512(brdgmm_dw_convolution_fwd_t) CPU_INSTANCE_X64(ip_convolution_fwd_t) - CPU_INSTANCE_AMX(brgemm_1x1_convolution_fwd_t) - CPU_INSTANCE_AMX(brgemm_convolution_fwd_t) + CPU_INSTANCE_AMX(brgemm_1x1_convolution_fwd_t, avx512_core_amx) + CPU_INSTANCE_AMX(brgemm_convolution_fwd_t, avx512_core_amx) CPU_INSTANCE_AMX(jit_avx512_core_amx_1x1_convolution_fwd_t) CPU_INSTANCE_AMX(jit_avx512_core_amx_convolution_fwd_t) - CPU_INSTANCE_AVX512(brgemm_1x1_convolution_fwd_t) - CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t) - CPU_INSTANCE_AVX512(brgemm_1x1_convolution_fwd_t) - CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t) + CPU_INSTANCE_AVX512(brgemm_1x1_convolution_fwd_t, avx512_core_vnni) + CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t, avx512_core_vnni) + CPU_INSTANCE_AVX512(brgemm_1x1_convolution_fwd_t, avx512_core) + 
CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t, avx512_core) CPU_INSTANCE_AVX512(jit_avx512_core_x8s8s32x_1x1_convolution_fwd_t) CPU_INSTANCE_AVX512(jit_avx512_core_x8s8s32x_convolution_fwd_t) - CPU_INSTANCE_AVX2(brgemm_1x1_convolution_fwd_t) - CPU_INSTANCE_AVX2(brgemm_1x1_convolution_fwd_t) - CPU_INSTANCE_AVX2(brgemm_convolution_fwd_t) - CPU_INSTANCE_AVX2(brgemm_convolution_fwd_t) - CPU_INSTANCE_AVX2(jit_uni_x8s8s32x_1x1_convolution_fwd_t) - CPU_INSTANCE_AVX2(jit_uni_x8s8s32x_convolution_fwd_t) - CPU_INSTANCE_SSE41(jit_uni_x8s8s32x_1x1_convolution_fwd_t) - CPU_INSTANCE_SSE41(jit_uni_x8s8s32x_convolution_fwd_t) - CPU_INSTANCE_AARCH64(jit_sve_512_x8s8s32x_convolution_fwd_t) + CPU_INSTANCE_AVX2(brgemm_1x1_convolution_fwd_t, avx2_vnni_2) + CPU_INSTANCE_AVX2(brgemm_1x1_convolution_fwd_t, avx2_vnni) + CPU_INSTANCE_AVX2(brgemm_convolution_fwd_t, avx2_vnni_2) + CPU_INSTANCE_AVX2(brgemm_convolution_fwd_t, avx2_vnni) + CPU_INSTANCE_AVX2(jit_uni_x8s8s32x_1x1_convolution_fwd_t, avx2) + CPU_INSTANCE_AVX2(jit_uni_x8s8s32x_convolution_fwd_t, avx2) + CPU_INSTANCE_SSE41(jit_uni_x8s8s32x_1x1_convolution_fwd_t, sse41) + CPU_INSTANCE_SSE41(jit_uni_x8s8s32x_convolution_fwd_t, sse41) + CPU_INSTANCE_AARCH64(jit_sve_512_x8s8s32x_convolution_fwd_t, u8, s8) CPU_INSTANCE(gemm_x8s8s32x_convolution_fwd_t) CPU_INSTANCE(ref_convolution_int8_fwd_t) CPU_INSTANCE(ref_fused_convolution_fwd_t) @@ -631,25 +733,25 @@ const std::map> &impl_list_map() {{forward, u8, s8, u8}, { CPU_INSTANCE_AVX512(brdgmm_dw_convolution_fwd_t) CPU_INSTANCE_X64(ip_convolution_fwd_t) - CPU_INSTANCE_AMX(brgemm_1x1_convolution_fwd_t) - CPU_INSTANCE_AMX(brgemm_convolution_fwd_t) + CPU_INSTANCE_AMX(brgemm_1x1_convolution_fwd_t, avx512_core_amx) + CPU_INSTANCE_AMX(brgemm_convolution_fwd_t, avx512_core_amx) CPU_INSTANCE_AMX(jit_avx512_core_amx_1x1_convolution_fwd_t) CPU_INSTANCE_AMX(jit_avx512_core_amx_convolution_fwd_t) - CPU_INSTANCE_AVX512(brgemm_1x1_convolution_fwd_t) - CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t) - CPU_INSTANCE_AVX512(brgemm_1x1_convolution_fwd_t) - CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t) + CPU_INSTANCE_AVX512(brgemm_1x1_convolution_fwd_t, avx512_core_vnni) + CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t, avx512_core_vnni) + CPU_INSTANCE_AVX512(brgemm_1x1_convolution_fwd_t, avx512_core) + CPU_INSTANCE_AVX512(brgemm_convolution_fwd_t, avx512_core) CPU_INSTANCE_AVX512(jit_avx512_core_x8s8s32x_1x1_convolution_fwd_t) CPU_INSTANCE_AVX512(jit_avx512_core_x8s8s32x_convolution_fwd_t) - CPU_INSTANCE_AVX2(brgemm_1x1_convolution_fwd_t) - CPU_INSTANCE_AVX2(brgemm_1x1_convolution_fwd_t) - CPU_INSTANCE_AVX2(brgemm_convolution_fwd_t) - CPU_INSTANCE_AVX2(brgemm_convolution_fwd_t) - CPU_INSTANCE_AVX2(jit_uni_x8s8s32x_1x1_convolution_fwd_t) - CPU_INSTANCE_AVX2(jit_uni_x8s8s32x_convolution_fwd_t) - CPU_INSTANCE_SSE41(jit_uni_x8s8s32x_1x1_convolution_fwd_t) - CPU_INSTANCE_SSE41(jit_uni_x8s8s32x_convolution_fwd_t) - CPU_INSTANCE_AARCH64(jit_sve_512_x8s8s32x_convolution_fwd_t) + CPU_INSTANCE_AVX2(brgemm_1x1_convolution_fwd_t, avx2_vnni_2) + CPU_INSTANCE_AVX2(brgemm_1x1_convolution_fwd_t, avx2_vnni) + CPU_INSTANCE_AVX2(brgemm_convolution_fwd_t, avx2_vnni_2) + CPU_INSTANCE_AVX2(brgemm_convolution_fwd_t, avx2_vnni) + CPU_INSTANCE_AVX2(jit_uni_x8s8s32x_1x1_convolution_fwd_t, avx2) + CPU_INSTANCE_AVX2(jit_uni_x8s8s32x_convolution_fwd_t, avx2) + CPU_INSTANCE_SSE41(jit_uni_x8s8s32x_1x1_convolution_fwd_t, sse41) + CPU_INSTANCE_SSE41(jit_uni_x8s8s32x_convolution_fwd_t, sse41) + CPU_INSTANCE_AARCH64(jit_sve_512_x8s8s32x_convolution_fwd_t, u8, u8) 
CPU_INSTANCE(gemm_x8s8s32x_convolution_fwd_t) CPU_INSTANCE(ref_convolution_int8_fwd_t) CPU_INSTANCE(ref_fused_convolution_fwd_t) @@ -657,100 +759,100 @@ const std::map> &impl_list_map() }}, // BWD int8 (diff_dst:u8) {{backward_data, f32, s8, u8}, REG_BWD_D_PK({ - CPU_INSTANCE_AMX(brgemm_convolution_bwd_strided_t) - CPU_INSTANCE_AVX512(brgemm_convolution_bwd_strided_t) - CPU_INSTANCE_AVX512(brgemm_convolution_bwd_strided_t) - CPU_INSTANCE_AVX2(brgemm_convolution_bwd_strided_t) - CPU_INSTANCE_AVX2(brgemm_convolution_bwd_strided_t) + CPU_INSTANCE_AMX(brgemm_convolution_bwd_strided_t, avx512_core_amx) + CPU_INSTANCE_AVX512(brgemm_convolution_bwd_strided_t, avx512_core_vnni) + CPU_INSTANCE_AVX512(brgemm_convolution_bwd_strided_t, avx512_core) + CPU_INSTANCE_AVX2(brgemm_convolution_bwd_strided_t, avx2_vnni_2) + CPU_INSTANCE_AVX2(brgemm_convolution_bwd_strided_t, avx2_vnni) CPU_INSTANCE(gemm_x8s8s32x_convolution_bwd_data_t) CPU_INSTANCE(ref_convolution_int8_bwd_data_t) nullptr, })}, {{backward_data, bf16, s8, u8}, REG_BWD_D_PK({ - CPU_INSTANCE_AMX(brgemm_convolution_bwd_strided_t) - CPU_INSTANCE_AVX512(brgemm_convolution_bwd_strided_t) - CPU_INSTANCE_AVX512(brgemm_convolution_bwd_strided_t) - CPU_INSTANCE_AVX2(brgemm_convolution_bwd_strided_t) + CPU_INSTANCE_AMX(brgemm_convolution_bwd_strided_t, avx512_core_amx) + CPU_INSTANCE_AVX512(brgemm_convolution_bwd_strided_t, avx512_core_vnni) + CPU_INSTANCE_AVX512(brgemm_convolution_bwd_strided_t, avx512_core) + CPU_INSTANCE_AVX2(brgemm_convolution_bwd_strided_t, avx2_vnni_2) CPU_INSTANCE(gemm_x8s8s32x_convolution_bwd_data_t) CPU_INSTANCE(ref_convolution_int8_bwd_data_t) nullptr, })}, {{backward_data, s32, s8, u8}, REG_BWD_D_PK({ - CPU_INSTANCE_AMX(brgemm_convolution_bwd_strided_t) - CPU_INSTANCE_AVX512(brgemm_convolution_bwd_strided_t) - CPU_INSTANCE_AVX512(brgemm_convolution_bwd_strided_t) - CPU_INSTANCE_AVX2(brgemm_convolution_bwd_strided_t) - CPU_INSTANCE_AVX2(brgemm_convolution_bwd_strided_t) + CPU_INSTANCE_AMX(brgemm_convolution_bwd_strided_t, avx512_core_amx) + CPU_INSTANCE_AVX512(brgemm_convolution_bwd_strided_t, avx512_core_vnni) + CPU_INSTANCE_AVX512(brgemm_convolution_bwd_strided_t, avx512_core) + CPU_INSTANCE_AVX2(brgemm_convolution_bwd_strided_t, avx2_vnni_2) + CPU_INSTANCE_AVX2(brgemm_convolution_bwd_strided_t, avx2_vnni) CPU_INSTANCE(gemm_x8s8s32x_convolution_bwd_data_t) CPU_INSTANCE(ref_convolution_int8_bwd_data_t) nullptr, })}, {{backward_data, s8, s8, u8}, REG_BWD_D_PK({ - CPU_INSTANCE_AMX(brgemm_convolution_bwd_strided_t) - CPU_INSTANCE_AVX512(brgemm_convolution_bwd_strided_t) - CPU_INSTANCE_AVX512(brgemm_convolution_bwd_strided_t) - CPU_INSTANCE_AVX2(brgemm_convolution_bwd_strided_t) - CPU_INSTANCE_AVX2(brgemm_convolution_bwd_strided_t) + CPU_INSTANCE_AMX(brgemm_convolution_bwd_strided_t, avx512_core_amx) + CPU_INSTANCE_AVX512(brgemm_convolution_bwd_strided_t, avx512_core_vnni) + CPU_INSTANCE_AVX512(brgemm_convolution_bwd_strided_t, avx512_core) + CPU_INSTANCE_AVX2(brgemm_convolution_bwd_strided_t, avx2_vnni_2) + CPU_INSTANCE_AVX2(brgemm_convolution_bwd_strided_t, avx2_vnni) CPU_INSTANCE(gemm_x8s8s32x_convolution_bwd_data_t) CPU_INSTANCE(ref_convolution_int8_bwd_data_t) nullptr, })}, {{backward_data, u8, s8, u8}, REG_BWD_D_PK({ - CPU_INSTANCE_AMX(brgemm_convolution_bwd_strided_t) - CPU_INSTANCE_AVX512(brgemm_convolution_bwd_strided_t) - CPU_INSTANCE_AVX512(brgemm_convolution_bwd_strided_t) - CPU_INSTANCE_AVX2(brgemm_convolution_bwd_strided_t) - CPU_INSTANCE_AVX2(brgemm_convolution_bwd_strided_t) + 
CPU_INSTANCE_AMX(brgemm_convolution_bwd_strided_t, avx512_core_amx) + CPU_INSTANCE_AVX512(brgemm_convolution_bwd_strided_t, avx512_core_vnni) + CPU_INSTANCE_AVX512(brgemm_convolution_bwd_strided_t, avx512_core) + CPU_INSTANCE_AVX2(brgemm_convolution_bwd_strided_t, avx2_vnni_2) + CPU_INSTANCE_AVX2(brgemm_convolution_bwd_strided_t, avx2_vnni) CPU_INSTANCE(gemm_x8s8s32x_convolution_bwd_data_t) CPU_INSTANCE(ref_convolution_int8_bwd_data_t) nullptr, })}, // BWD int8 (diff_dst:s8) {{backward_data, f32, s8, s8}, REG_BWD_D_PK({ - CPU_INSTANCE_AMX(brgemm_convolution_bwd_strided_t) - CPU_INSTANCE_AVX512(brgemm_convolution_bwd_strided_t) - CPU_INSTANCE_AVX512(brgemm_convolution_bwd_strided_t) - CPU_INSTANCE_AVX2(brgemm_convolution_bwd_strided_t) - CPU_INSTANCE_AVX2(brgemm_convolution_bwd_strided_t) + CPU_INSTANCE_AMX(brgemm_convolution_bwd_strided_t, avx512_core_amx) + CPU_INSTANCE_AVX512(brgemm_convolution_bwd_strided_t, avx512_core_vnni) + CPU_INSTANCE_AVX512(brgemm_convolution_bwd_strided_t, avx512_core) + CPU_INSTANCE_AVX2(brgemm_convolution_bwd_strided_t, avx2_vnni_2) + CPU_INSTANCE_AVX2(brgemm_convolution_bwd_strided_t, avx2_vnni) CPU_INSTANCE(gemm_x8s8s32x_convolution_bwd_data_t) CPU_INSTANCE(ref_convolution_int8_bwd_data_t) nullptr, })}, {{backward_data, bf16, s8, s8}, REG_BWD_D_PK({ - CPU_INSTANCE_AMX(brgemm_convolution_bwd_strided_t) - CPU_INSTANCE_AVX512(brgemm_convolution_bwd_strided_t) - CPU_INSTANCE_AVX512(brgemm_convolution_bwd_strided_t) - CPU_INSTANCE_AVX2(brgemm_convolution_bwd_strided_t) + CPU_INSTANCE_AMX(brgemm_convolution_bwd_strided_t, avx512_core_amx) + CPU_INSTANCE_AVX512(brgemm_convolution_bwd_strided_t, avx512_core_vnni) + CPU_INSTANCE_AVX512(brgemm_convolution_bwd_strided_t, avx512_core) + CPU_INSTANCE_AVX2(brgemm_convolution_bwd_strided_t, avx2_vnni_2) CPU_INSTANCE(gemm_x8s8s32x_convolution_bwd_data_t) CPU_INSTANCE(ref_convolution_int8_bwd_data_t) nullptr, })}, {{backward_data, s32, s8, s8}, REG_BWD_D_PK({ - CPU_INSTANCE_AMX(brgemm_convolution_bwd_strided_t) - CPU_INSTANCE_AVX512(brgemm_convolution_bwd_strided_t) - CPU_INSTANCE_AVX512(brgemm_convolution_bwd_strided_t) - CPU_INSTANCE_AVX2(brgemm_convolution_bwd_strided_t) - CPU_INSTANCE_AVX2(brgemm_convolution_bwd_strided_t) + CPU_INSTANCE_AMX(brgemm_convolution_bwd_strided_t, avx512_core_amx) + CPU_INSTANCE_AVX512(brgemm_convolution_bwd_strided_t, avx512_core_vnni) + CPU_INSTANCE_AVX512(brgemm_convolution_bwd_strided_t, avx512_core) + CPU_INSTANCE_AVX2(brgemm_convolution_bwd_strided_t, avx2_vnni_2) + CPU_INSTANCE_AVX2(brgemm_convolution_bwd_strided_t, avx2_vnni) CPU_INSTANCE(gemm_x8s8s32x_convolution_bwd_data_t) CPU_INSTANCE(ref_convolution_int8_bwd_data_t) nullptr, })}, {{backward_data, s8, s8, s8}, REG_BWD_D_PK({ - CPU_INSTANCE_AMX(brgemm_convolution_bwd_strided_t) - CPU_INSTANCE_AVX512(brgemm_convolution_bwd_strided_t) - CPU_INSTANCE_AVX512(brgemm_convolution_bwd_strided_t) - CPU_INSTANCE_AVX2(brgemm_convolution_bwd_strided_t) - CPU_INSTANCE_AVX2(brgemm_convolution_bwd_strided_t) + CPU_INSTANCE_AMX(brgemm_convolution_bwd_strided_t, avx512_core_amx) + CPU_INSTANCE_AVX512(brgemm_convolution_bwd_strided_t, avx512_core_vnni) + CPU_INSTANCE_AVX512(brgemm_convolution_bwd_strided_t, avx512_core) + CPU_INSTANCE_AVX2(brgemm_convolution_bwd_strided_t, avx2_vnni_2) + CPU_INSTANCE_AVX2(brgemm_convolution_bwd_strided_t, avx2_vnni) CPU_INSTANCE(gemm_x8s8s32x_convolution_bwd_data_t) CPU_INSTANCE(ref_convolution_int8_bwd_data_t) nullptr, })}, {{backward_data, u8, s8, s8}, REG_BWD_D_PK({ - 
CPU_INSTANCE_AMX(brgemm_convolution_bwd_strided_t) - CPU_INSTANCE_AVX512(brgemm_convolution_bwd_strided_t) - CPU_INSTANCE_AVX512(brgemm_convolution_bwd_strided_t) - CPU_INSTANCE_AVX2(brgemm_convolution_bwd_strided_t) - CPU_INSTANCE_AVX2(brgemm_convolution_bwd_strided_t) + CPU_INSTANCE_AMX(brgemm_convolution_bwd_strided_t, avx512_core_amx) + CPU_INSTANCE_AVX512(brgemm_convolution_bwd_strided_t, avx512_core_vnni) + CPU_INSTANCE_AVX512(brgemm_convolution_bwd_strided_t, avx512_core) + CPU_INSTANCE_AVX2(brgemm_convolution_bwd_strided_t, avx2_vnni_2) + CPU_INSTANCE_AVX2(brgemm_convolution_bwd_strided_t, avx2_vnni) CPU_INSTANCE(gemm_x8s8s32x_convolution_bwd_data_t) CPU_INSTANCE(ref_convolution_int8_bwd_data_t) nullptr, @@ -783,4 +885,4 @@ const impl_list_item_t *get_convolution_impl_list( } // namespace cpu } // namespace impl -} // namespace dnnl \ No newline at end of file +} // namespace dnnl diff --git a/src/cpu/cpu_deconvolution_list.cpp b/src/cpu/cpu_deconvolution_list.cpp index 468f4711452..a904216222b 100644 --- a/src/cpu/cpu_deconvolution_list.cpp +++ b/src/cpu/cpu_deconvolution_list.cpp @@ -1,7 +1,7 @@ /******************************************************************************* * Copyright 2019-2023 Intel Corporation * Copyright 2022 FUJITSU LIMITED -* Copyright 2022 Arm Ltd. and affiliates +* Copyright 2022, 2024 Arm Ltd. and affiliates * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,10 +16,9 @@ * limitations under the License. *******************************************************************************/ +#include "common/compiler_workarounds.hpp" #include "cpu/cpu_engine.hpp" - #include "cpu/ref_deconvolution.hpp" - #if DNNL_X64 #include "cpu/x64/jit_avx512_core_amx_deconvolution.hpp" #include "cpu/x64/jit_avx512_core_x8s8s32x_1x1_deconvolution.hpp" @@ -30,11 +29,12 @@ using namespace dnnl::impl::cpu::x64; #elif DNNL_AARCH64 #include "cpu/aarch64/jit_sve_512_core_x8s8s32x_deconvolution.hpp" -#if DNNL_AARCH64_USE_ACL -#include "cpu/aarch64/acl_deconvolution.hpp" -#endif using namespace dnnl::impl::cpu::aarch64; #endif +#if DNNL_USE_ACL +#include "cpu/acl/acl_deconvolution.hpp" +using namespace dnnl::impl::cpu::acl; +#endif namespace dnnl { namespace impl { @@ -48,24 +48,24 @@ using namespace dnnl::impl::prop_kind; const std::map> &impl_list_map() { static const std::map> the_map = REG_DECONV_P({ {{forward}, { - CPU_INSTANCE_AMX(brgemm_deconvolution_fwd_t) - CPU_INSTANCE_AMX(brgemm_deconvolution_fwd_t) + CPU_INSTANCE_AMX(brgemm_deconvolution_fwd_t, avx512_core_amx_fp16) + CPU_INSTANCE_AMX(brgemm_deconvolution_fwd_t, avx512_core_amx) CPU_INSTANCE_AMX(jit_avx512_core_amx_deconvolution_fwd_t) - CPU_INSTANCE_AVX512(brgemm_deconvolution_fwd_t) - CPU_INSTANCE_AVX512(brgemm_deconvolution_fwd_t) - CPU_INSTANCE_AVX512(brgemm_deconvolution_fwd_t) - CPU_INSTANCE_AVX512(brgemm_deconvolution_fwd_t) + CPU_INSTANCE_AVX512(brgemm_deconvolution_fwd_t, avx512_core_fp16) + CPU_INSTANCE_AVX512(brgemm_deconvolution_fwd_t, avx512_core_bf16) + CPU_INSTANCE_AVX512(brgemm_deconvolution_fwd_t, avx512_core_vnni) + CPU_INSTANCE_AVX512(brgemm_deconvolution_fwd_t, avx512_core) CPU_INSTANCE_AVX512(jit_avx512_core_x8s8s32x_1x1_deconvolution_fwd_t) CPU_INSTANCE_AVX512(jit_avx512_core_x8s8s32x_deconvolution_fwd_t) - CPU_INSTANCE_AVX2(brgemm_deconvolution_fwd_t) - CPU_INSTANCE_AVX2(brgemm_deconvolution_fwd_t) - CPU_INSTANCE_AVX2(brgemm_deconvolution_fwd_t) - CPU_INSTANCE_AVX2(jit_uni_x8s8s32x_1x1_deconvolution_fwd_t) - 
CPU_INSTANCE_AVX2(jit_uni_x8s8s32x_deconvolution_fwd_t) - CPU_INSTANCE_SSE41(jit_uni_x8s8s32x_1x1_deconvolution_fwd_t) - CPU_INSTANCE_SSE41(jit_uni_x8s8s32x_deconvolution_fwd_t) + CPU_INSTANCE_AVX2(brgemm_deconvolution_fwd_t, avx2_vnni_2) + CPU_INSTANCE_AVX2(brgemm_deconvolution_fwd_t, avx2_vnni) + CPU_INSTANCE_AVX2(brgemm_deconvolution_fwd_t, avx2) + CPU_INSTANCE_AVX2(jit_uni_x8s8s32x_1x1_deconvolution_fwd_t, avx2) + CPU_INSTANCE_AVX2(jit_uni_x8s8s32x_deconvolution_fwd_t, avx2) + CPU_INSTANCE_SSE41(jit_uni_x8s8s32x_1x1_deconvolution_fwd_t, sse41) + CPU_INSTANCE_SSE41(jit_uni_x8s8s32x_deconvolution_fwd_t, sse41) CPU_INSTANCE_AARCH64(jit_sve_512_core_x8s8s32x_deconvolution_fwd_t) - CPU_INSTANCE_AARCH64_ACL(acl_deconvolution_fwd_t) + CPU_INSTANCE_ACL(acl_deconvolution_fwd_t) CPU_INSTANCE(ref_deconvolution_fwd_t) nullptr, }}, diff --git a/src/cpu/cpu_eltwise_list.cpp b/src/cpu/cpu_eltwise_list.cpp index 03d4f107449..704a18f1c69 100644 --- a/src/cpu/cpu_eltwise_list.cpp +++ b/src/cpu/cpu_eltwise_list.cpp @@ -27,11 +27,12 @@ using namespace dnnl::impl::cpu::x64; #elif DNNL_AARCH64 #include "cpu/aarch64/jit_uni_eltwise.hpp" #include "cpu/aarch64/jit_uni_eltwise_int.hpp" -#if DNNL_AARCH64_USE_ACL -#include "cpu/aarch64/acl_eltwise.hpp" -#endif // DNNL_AARCH64_USE_ACL using namespace dnnl::impl::cpu::aarch64; #endif +#if DNNL_USE_ACL +#include "cpu/acl/acl_eltwise.hpp" +using namespace dnnl::impl::cpu::acl; +#endif namespace dnnl { namespace impl { @@ -45,59 +46,59 @@ using namespace dnnl::impl::prop_kind; const std::map> &impl_list_map() { static const std::map> the_map = REG_ELTWISE_P({ {{forward}, { - CPU_INSTANCE_X64(jit_uni_eltwise_fwd_t) - CPU_INSTANCE_X64(jit_uni_eltwise_fwd_t) - CPU_INSTANCE_X64(jit_uni_eltwise_fwd_t) - CPU_INSTANCE_X64(jit_uni_eltwise_fwd_t) - CPU_INSTANCE_X64(jit_uni_eltwise_fwd_t) - CPU_INSTANCE_X64(jit_uni_eltwise_fwd_t) - CPU_INSTANCE_X64(jit_uni_eltwise_fwd_t) - CPU_INSTANCE_X64(jit_uni_eltwise_fwd_t) - CPU_INSTANCE_X64(jit_uni_eltwise_fwd_t) - CPU_INSTANCE_X64(jit_uni_eltwise_fwd_t) - CPU_INSTANCE_X64(jit_uni_eltwise_int_fwd_t) - CPU_INSTANCE_X64(jit_uni_eltwise_int_fwd_t) - CPU_INSTANCE_X64(jit_uni_eltwise_int_fwd_t) - CPU_INSTANCE_X64(jit_uni_eltwise_int_fwd_t) - CPU_INSTANCE_X64(jit_uni_eltwise_int_fwd_t) - CPU_INSTANCE_X64(jit_uni_eltwise_int_fwd_t) - CPU_INSTANCE_X64(jit_uni_eltwise_int_fwd_t) - CPU_INSTANCE_X64(jit_uni_eltwise_int_fwd_t) - CPU_INSTANCE_X64(jit_uni_eltwise_int_fwd_t) - CPU_INSTANCE_AARCH64(jit_uni_eltwise_fwd_t) - CPU_INSTANCE_AARCH64(jit_uni_eltwise_fwd_t) - CPU_INSTANCE_AARCH64(jit_uni_eltwise_fwd_t) - CPU_INSTANCE_AARCH64(jit_uni_eltwise_int_fwd_t) - CPU_INSTANCE_AARCH64(jit_uni_eltwise_int_fwd_t) - CPU_INSTANCE_AARCH64(jit_uni_eltwise_int_fwd_t) - CPU_INSTANCE_AARCH64_ACL(acl_eltwise_fwd_t) - CPU_INSTANCE(ref_eltwise_fwd_t) - CPU_INSTANCE(ref_eltwise_fwd_t) - CPU_INSTANCE(ref_eltwise_fwd_t) - CPU_INSTANCE(ref_eltwise_fwd_t) - CPU_INSTANCE(ref_eltwise_fwd_t) - CPU_INSTANCE(ref_eltwise_fwd_t) - CPU_INSTANCE(ref_eltwise_fwd_t) - CPU_INSTANCE(ref_eltwise_fwd_t) + CPU_INSTANCE_X64(jit_uni_eltwise_fwd_t, avx512_core_amx, f8_e4m3) + CPU_INSTANCE_X64(jit_uni_eltwise_fwd_t, avx512_core_amx, f8_e5m2) + // CPU_INSTANCE_X64(jit_uni_eltwise_fwd_t) + CPU_INSTANCE_X64(jit_uni_eltwise_fwd_t, avx512_core, f32) + CPU_INSTANCE_X64(jit_uni_eltwise_fwd_t, avx512_core, bf16) + // CPU_INSTANCE_X64(jit_uni_eltwise_fwd_t) + CPU_INSTANCE_X64(jit_uni_eltwise_fwd_t, avx2_vnni_2, bf16) + CPU_INSTANCE_X64(jit_uni_eltwise_fwd_t, avx2, f32) + 
CPU_INSTANCE_X64(jit_uni_eltwise_fwd_t, avx, f32) + CPU_INSTANCE_X64(jit_uni_eltwise_fwd_t, sse41, f32) + CPU_INSTANCE_X64(jit_uni_eltwise_int_fwd_t, avx512_core, s32) + CPU_INSTANCE_X64(jit_uni_eltwise_int_fwd_t, avx512_core, s8) + CPU_INSTANCE_X64(jit_uni_eltwise_int_fwd_t, avx512_core, u8) + CPU_INSTANCE_X64(jit_uni_eltwise_int_fwd_t, avx2, s32) + CPU_INSTANCE_X64(jit_uni_eltwise_int_fwd_t, avx2, s8) + CPU_INSTANCE_X64(jit_uni_eltwise_int_fwd_t, avx2, u8) + CPU_INSTANCE_X64(jit_uni_eltwise_int_fwd_t, sse41, s32) + CPU_INSTANCE_X64(jit_uni_eltwise_int_fwd_t, sse41, s8) + CPU_INSTANCE_X64(jit_uni_eltwise_int_fwd_t, sse41, u8) + CPU_INSTANCE_AARCH64(jit_uni_eltwise_fwd_t, sve_512, f32) + CPU_INSTANCE_AARCH64(jit_uni_eltwise_fwd_t, sve_256, f32) + CPU_INSTANCE_AARCH64(jit_uni_eltwise_fwd_t, sve_128, f32) + CPU_INSTANCE_AARCH64(jit_uni_eltwise_int_fwd_t, sve_512, s32) + CPU_INSTANCE_AARCH64(jit_uni_eltwise_int_fwd_t, sve_512, s8) + CPU_INSTANCE_AARCH64(jit_uni_eltwise_int_fwd_t, sve_512, u8) + CPU_INSTANCE_ACL(acl_eltwise_fwd_t) + CPU_INSTANCE(ref_eltwise_fwd_t, f32) + CPU_INSTANCE(ref_eltwise_fwd_t, bf16) + // CPU_INSTANCE(ref_eltwise_fwd_t) + CPU_INSTANCE(ref_eltwise_fwd_t, s32) + CPU_INSTANCE(ref_eltwise_fwd_t, s8) + CPU_INSTANCE(ref_eltwise_fwd_t, u8) + CPU_INSTANCE(ref_eltwise_fwd_t, f8_e4m3) + CPU_INSTANCE(ref_eltwise_fwd_t, f8_e5m2) nullptr, }}, {{backward}, REG_BWD_PK({ - CPU_INSTANCE_X64(jit_uni_eltwise_bwd_t) - CPU_INSTANCE_X64(jit_uni_eltwise_bwd_t) - CPU_INSTANCE_X64(jit_uni_eltwise_bwd_t) - CPU_INSTANCE_X64(jit_uni_eltwise_bwd_t) - CPU_INSTANCE_X64(jit_uni_eltwise_bwd_t) - CPU_INSTANCE_X64(jit_uni_eltwise_bwd_t) - CPU_INSTANCE_X64(jit_uni_eltwise_bwd_t) - CPU_INSTANCE_X64(jit_uni_eltwise_bwd_t) - CPU_INSTANCE_AARCH64(jit_uni_eltwise_bwd_t) - CPU_INSTANCE_AARCH64(jit_uni_eltwise_bwd_t) - CPU_INSTANCE_AARCH64(jit_uni_eltwise_bwd_t) - CPU_INSTANCE(ref_eltwise_bwd_t) - CPU_INSTANCE(ref_eltwise_bwd_t) - CPU_INSTANCE(ref_eltwise_bwd_t) - CPU_INSTANCE(ref_eltwise_bwd_t) - CPU_INSTANCE(ref_eltwise_bwd_t) + CPU_INSTANCE_X64(jit_uni_eltwise_bwd_t, avx512_core_amx, f8_e4m3) + CPU_INSTANCE_X64(jit_uni_eltwise_bwd_t, avx512_core_amx, f8_e5m2) + // CPU_INSTANCE_X64(jit_uni_eltwise_bwd_t) + CPU_INSTANCE_X64(jit_uni_eltwise_bwd_t, avx512_core, f32) + CPU_INSTANCE_X64(jit_uni_eltwise_bwd_t, avx512_core, bf16) + CPU_INSTANCE_X64(jit_uni_eltwise_bwd_t, avx2, f32) + CPU_INSTANCE_X64(jit_uni_eltwise_bwd_t, avx, f32) + CPU_INSTANCE_X64(jit_uni_eltwise_bwd_t, sse41, f32) + CPU_INSTANCE_AARCH64(jit_uni_eltwise_bwd_t, sve_512, f32) + CPU_INSTANCE_AARCH64(jit_uni_eltwise_bwd_t, sve_256, f32) + CPU_INSTANCE_AARCH64(jit_uni_eltwise_bwd_t, sve_128, f32) + CPU_INSTANCE(ref_eltwise_bwd_t, f32) + CPU_INSTANCE(ref_eltwise_bwd_t, bf16) + // CPU_INSTANCE(ref_eltwise_bwd_t) + CPU_INSTANCE(ref_eltwise_bwd_t, f8_e4m3) + CPU_INSTANCE(ref_eltwise_bwd_t, f8_e5m2) nullptr, })}, }); diff --git a/src/cpu/cpu_engine.cpp b/src/cpu/cpu_engine.cpp index c9263eda2cd..751038e698b 100644 --- a/src/cpu/cpu_engine.cpp +++ b/src/cpu/cpu_engine.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2016-2024 Intel Corporation +* Copyright 2016-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -30,6 +30,9 @@ namespace cpu { status_t cpu_engine_t::create_memory_storage( memory_storage_t **storage, unsigned flags, size_t size, void *handle) { + assert(runtime_kind() != runtime_kind::sycl); + if (runtime_kind() == runtime_kind::sycl) return status::runtime_error; + auto _storage = new cpu_memory_storage_t(this); if (_storage == nullptr) return status::out_of_memory; status_t status = _storage->init(flags, size, handle); diff --git a/src/cpu/cpu_engine.hpp b/src/cpu/cpu_engine.hpp index 494a5dc7f51..d98f79b95bd 100644 --- a/src/cpu/cpu_engine.hpp +++ b/src/cpu/cpu_engine.hpp @@ -29,21 +29,25 @@ #include "cpu/platform.hpp" -#if DNNL_AARCH64 && DNNL_AARCH64_USE_ACL -#include "cpu/aarch64/acl_thread.hpp" +#if DNNL_USE_ACL +#include "cpu/acl/acl_thread.hpp" #endif -#define CPU_INSTANCE(...) \ +#define CPU_INSTANCE_IMPL(...) \ impl_list_item_t( \ - impl_list_item_t::type_deduction_helper_t<__VA_ARGS__::pd_t>()), -#define CPU_INSTANCE_X64(...) DNNL_X64_ONLY(CPU_INSTANCE(__VA_ARGS__)) + impl_list_item_t::type_deduction_helper_t<__VA_ARGS__::pd_t>()) +#define CPU_INSTANCE(...) DNNL_PRIMITIVE_IMPL(CPU_INSTANCE_IMPL, __VA_ARGS__) +// Expanding DNNL_X64_ONLY in order to fix Conditional Compilation failure on Windows + CPU plugin. +// DNNL_X64_ONLY == CONCAT2(Z_DO_IF_, DNNL_X64) +#define CPU_INSTANCE_X64(...) \ + CONCAT2(Z_DO_IF_, DNNL_X64)(CPU_INSTANCE(__VA_ARGS__)) #define CPU_INSTANCE_SSE41(...) REG_SSE41_ISA(CPU_INSTANCE(__VA_ARGS__)) #define CPU_INSTANCE_AVX2(...) REG_AVX2_ISA(CPU_INSTANCE(__VA_ARGS__)) #define CPU_INSTANCE_AVX512(...) REG_AVX512_ISA(CPU_INSTANCE(__VA_ARGS__)) #define CPU_INSTANCE_AMX(...) REG_AMX_ISA(CPU_INSTANCE(__VA_ARGS__)) #define CPU_INSTANCE_AARCH64(...) DNNL_AARCH64_ONLY(CPU_INSTANCE(__VA_ARGS__)) -#define CPU_INSTANCE_AARCH64_ACL(...) \ - DNNL_AARCH64_ACL_ONLY(CPU_INSTANCE(__VA_ARGS__)) +#define CPU_INSTANCE_ARM(...) DNNL_ARM_ONLY(CPU_INSTANCE(__VA_ARGS__)) +#define CPU_INSTANCE_ACL(...) DNNL_ACL_ONLY(CPU_INSTANCE(__VA_ARGS__)) #define CPU_INSTANCE_RV64GCV(...) DNNL_RV64GCV_ONLY(CPU_INSTANCE(__VA_ARGS__)) namespace dnnl { @@ -88,7 +92,7 @@ class cpu_engine_impl_list_t { #define CASE(kind) \ case primitive_kind::kind: \ return get_##kind##_impl_list((const kind##_desc_t *)desc); - switch ((int) desc->kind) { + switch ((int)desc->primitive_kind) { CASE(batch_normalization); CASE(binary); CASE(convolution); @@ -156,8 +160,8 @@ class cpu_engine_factory_t : public engine_factory_t { *engine = new cpu_engine_t(new impl::engine_impl_t( engine_kind::cpu, get_cpu_native_runtime(), 0)); -#if DNNL_AARCH64 && DNNL_AARCH64_USE_ACL - dnnl::impl::cpu::aarch64::acl_thread_utils::set_acl_threading(); +#if DNNL_USE_ACL + dnnl::impl::cpu::acl::acl_thread_utils::set_acl_threading(); #endif return status::success; }; diff --git a/src/cpu/cpu_inner_product_list.cpp b/src/cpu/cpu_inner_product_list.cpp index 1f595473047..51f754c8450 100644 --- a/src/cpu/cpu_inner_product_list.cpp +++ b/src/cpu/cpu_inner_product_list.cpp @@ -1,5 +1,6 @@ /******************************************************************************* -* Copyright 2019-2023 Intel Corporation +* Copyright 2019-2025 Intel Corporation +* Copyright 2025 Arm Ltd. and affiliates * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -24,12 +25,13 @@ #if DNNL_X64 #include "cpu/x64/gemm_bf16_inner_product.hpp" #include "cpu/x64/jit_brgemm_inner_product.hpp" +#include "cpu/x64/matmul_inner_product.hpp" using namespace dnnl::impl::cpu::x64; #endif -#if DNNL_AARCH64 && DNNL_AARCH64_USE_ACL -#include "cpu/aarch64/acl_inner_product.hpp" -using namespace dnnl::impl::cpu::aarch64; +#if DNNL_USE_ACL +#include "cpu/acl/acl_inner_product.hpp" +using namespace dnnl::impl::cpu::acl; #endif namespace dnnl { @@ -40,46 +42,166 @@ namespace { using namespace dnnl::impl::data_type; using namespace dnnl::impl::prop_kind; +#define BRGEMM_FP8_FWD_IP(dtsrc, dtwei, dtdst) \ + { \ + {forward, dtsrc, dtwei, dtdst}, { \ + CPU_INSTANCE_AMX(brgemm_inner_product_fwd_t) \ + CPU_INSTANCE(ref_inner_product_fwd_t) nullptr, \ + } \ + } + // clang-format off const std::map> &impl_list_map() { static const std::map> the_map = REG_IP_P({ {{forward, f32, f32, f32}, { - CPU_INSTANCE_AMX(brgemm_inner_product_fwd_t) // bf32 - CPU_INSTANCE_AVX512(brgemm_inner_product_fwd_t) - CPU_INSTANCE_AVX2(brgemm_inner_product_fwd_t) - CPU_INSTANCE_AARCH64_ACL(acl_inner_product_fwd_t) - CPU_INSTANCE(gemm_inner_product_fwd_t) + //CPU_INSTANCE_X64(matmul_inner_product_fwd_t) + CPU_INSTANCE_AMX(brgemm_inner_product_fwd_t,avx512_core_amx) // bf32 + CPU_INSTANCE_AVX512(brgemm_inner_product_fwd_t,avx512_core) + CPU_INSTANCE_AVX2(brgemm_inner_product_fwd_t,avx2) + CPU_INSTANCE_ACL(acl_inner_product_fwd_t) + CPU_INSTANCE(gemm_inner_product_fwd_t, f32) CPU_INSTANCE(ref_inner_product_fwd_t) nullptr, }}, + {{forward, f32, u8, f32}, { + CPU_INSTANCE_AVX512(brgemm_inner_product_fwd_t, avx512_core_vnni) + CPU_INSTANCE_AVX512(brgemm_inner_product_fwd_t, avx512_core) + CPU_INSTANCE_AVX2(brgemm_inner_product_fwd_t, avx2_vnni) + CPU_INSTANCE_AVX2(brgemm_inner_product_fwd_t, avx2) + nullptr, + }}, + {{forward, f32, s8, f32}, { + CPU_INSTANCE_AVX512(brgemm_inner_product_fwd_t, avx512_core_vnni) + CPU_INSTANCE_AVX512(brgemm_inner_product_fwd_t, avx512_core) + CPU_INSTANCE_AVX2(brgemm_inner_product_fwd_t, avx2_vnni) + CPU_INSTANCE_AVX2(brgemm_inner_product_fwd_t, avx2) + nullptr, + }}, + {{forward, f32, nf4, f32}, { + CPU_INSTANCE_AVX512(brgemm_inner_product_fwd_t, avx512_core) + CPU_INSTANCE_AVX2(brgemm_inner_product_fwd_t, avx2) + nullptr, + }}, + {{forward, f32, f4_e2m1, f32}, { + CPU_INSTANCE_AVX512(brgemm_inner_product_fwd_t, avx512_core) + CPU_INSTANCE_AVX2(brgemm_inner_product_fwd_t, avx2) + nullptr, + }}, + {{forward, f32, s4, f32}, { + CPU_INSTANCE_AVX512(brgemm_inner_product_fwd_t, avx512_core) + CPU_INSTANCE_AVX2(brgemm_inner_product_fwd_t, avx2) + nullptr, + }}, + {{forward, f32, u4, f32}, { + CPU_INSTANCE_AVX512(brgemm_inner_product_fwd_t, avx512_core_vnni) + CPU_INSTANCE_AVX512(brgemm_inner_product_fwd_t, avx512_core) + CPU_INSTANCE_AVX2(brgemm_inner_product_fwd_t, avx2_vnni) + CPU_INSTANCE_AVX2(brgemm_inner_product_fwd_t, avx2) + nullptr, + }}, + {{forward, f32, f16, f32}, { + CPU_INSTANCE_AVX512(brgemm_inner_product_fwd_t, avx512_core) + CPU_INSTANCE_AVX2(brgemm_inner_product_fwd_t, avx2) + nullptr, + }}, + {{forward, f32, bf16, f32}, { + CPU_INSTANCE_AVX512(brgemm_inner_product_fwd_t, avx512_core) + CPU_INSTANCE_AVX2(brgemm_inner_product_fwd_t, avx2) + nullptr, + }}, {{forward, bf16, bf16, f32}, { - CPU_INSTANCE_AMX(brgemm_inner_product_fwd_t) - CPU_INSTANCE_AVX512(brgemm_inner_product_fwd_t) - CPU_INSTANCE_AVX512(gemm_bf16_inner_product_fwd_t) - CPU_INSTANCE_AVX2(brgemm_inner_product_fwd_t) + //CPU_INSTANCE_X64(matmul_inner_product_fwd_t) + 
CPU_INSTANCE_AMX(brgemm_inner_product_fwd_t,avx512_core_amx) + CPU_INSTANCE_AVX512(brgemm_inner_product_fwd_t,avx512_core_bf16) + CPU_INSTANCE_AVX512(gemm_bf16_inner_product_fwd_t,f32) + CPU_INSTANCE_AVX2(brgemm_inner_product_fwd_t,avx2_vnni_2) CPU_INSTANCE(ref_inner_product_fwd_t) nullptr, }}, {{forward, bf16, bf16, bf16}, { - CPU_INSTANCE_AMX(brgemm_inner_product_fwd_t) - CPU_INSTANCE_AVX512(brgemm_inner_product_fwd_t) - CPU_INSTANCE_AVX512(gemm_bf16_inner_product_fwd_t) - CPU_INSTANCE_AVX2(brgemm_inner_product_fwd_t) + //CPU_INSTANCE_X64(matmul_inner_product_fwd_t) + CPU_INSTANCE_AMX(brgemm_inner_product_fwd_t,avx512_core_amx) + CPU_INSTANCE_AVX512(brgemm_inner_product_fwd_t,avx512_core_bf16) + CPU_INSTANCE_AVX512(gemm_bf16_inner_product_fwd_t,bf16) + CPU_INSTANCE_AVX2(brgemm_inner_product_fwd_t,avx2_vnni_2) + CPU_INSTANCE_ACL(acl_inner_product_fwd_t) CPU_INSTANCE(ref_inner_product_fwd_t) nullptr, }}, + {{forward, bf16, u8, f32}, { + CPU_INSTANCE_AMX(brgemm_inner_product_fwd_t, avx512_core_amx) + CPU_INSTANCE_AVX512(brgemm_inner_product_fwd_t, avx512_core_bf16) + nullptr, + }}, + {{forward, bf16, u8, bf16}, { + CPU_INSTANCE_AMX(brgemm_inner_product_fwd_t, avx512_core_amx) + CPU_INSTANCE_AVX512(brgemm_inner_product_fwd_t, avx512_core_bf16) + nullptr, + }}, + {{forward, bf16, s8, f32}, { + CPU_INSTANCE_AMX(brgemm_inner_product_fwd_t, avx512_core_amx) + CPU_INSTANCE_AVX512(brgemm_inner_product_fwd_t, avx512_core_bf16) + nullptr, + }}, + {{forward, bf16, s8, bf16}, { + CPU_INSTANCE_AMX(brgemm_inner_product_fwd_t, avx512_core_amx) + CPU_INSTANCE_AVX512(brgemm_inner_product_fwd_t, avx512_core_bf16) + nullptr, + }}, + {{forward, bf16, nf4, f32}, { + CPU_INSTANCE_AMX(brgemm_inner_product_fwd_t, avx512_core_amx) + CPU_INSTANCE_AVX512(brgemm_inner_product_fwd_t, avx512_core_bf16) + nullptr, + }}, + {{forward, bf16, nf4, bf16}, { + CPU_INSTANCE_AMX(brgemm_inner_product_fwd_t, avx512_core_amx) + CPU_INSTANCE_AVX512(brgemm_inner_product_fwd_t, avx512_core_bf16) + nullptr, + }}, + {{forward, bf16, f4_e2m1, f32}, { + CPU_INSTANCE_AMX(brgemm_inner_product_fwd_t, avx512_core_amx) + CPU_INSTANCE_AVX512(brgemm_inner_product_fwd_t, avx512_core_bf16) + nullptr, + }}, + {{forward, bf16, f4_e2m1, bf16}, { + CPU_INSTANCE_AMX(brgemm_inner_product_fwd_t, avx512_core_amx) + CPU_INSTANCE_AVX512(brgemm_inner_product_fwd_t, avx512_core_bf16) + nullptr, + }}, + {{forward, bf16, s4, f32}, { + CPU_INSTANCE_AMX(brgemm_inner_product_fwd_t, avx512_core_amx) + CPU_INSTANCE_AVX512(brgemm_inner_product_fwd_t, avx512_core_bf16) + nullptr, + }}, + {{forward, bf16, s4, bf16}, { + CPU_INSTANCE_AMX(brgemm_inner_product_fwd_t, avx512_core_amx) + CPU_INSTANCE_AVX512(brgemm_inner_product_fwd_t, avx512_core_bf16) + nullptr, + }}, + {{forward, bf16, u4, f32}, { + CPU_INSTANCE_AMX(brgemm_inner_product_fwd_t, avx512_core_amx) + CPU_INSTANCE_AVX512(brgemm_inner_product_fwd_t, avx512_core_bf16) + nullptr, + }}, + {{forward, bf16, u4, bf16}, { + CPU_INSTANCE_AMX(brgemm_inner_product_fwd_t, avx512_core_amx) + CPU_INSTANCE_AVX512(brgemm_inner_product_fwd_t, avx512_core_bf16) + nullptr, + }}, {{forward, f16, f16, f32}, { - CPU_INSTANCE_AMX(brgemm_inner_product_fwd_t) - CPU_INSTANCE_AVX512(brgemm_inner_product_fwd_t) - CPU_INSTANCE_AVX2(brgemm_inner_product_fwd_t) + //CPU_INSTANCE_X64(matmul_inner_product_fwd_t) + CPU_INSTANCE_AMX(brgemm_inner_product_fwd_t,avx512_core_amx_fp16) + CPU_INSTANCE_AVX512(brgemm_inner_product_fwd_t,avx512_core_fp16) + CPU_INSTANCE_AVX2(brgemm_inner_product_fwd_t,avx2_vnni_2) CPU_INSTANCE(ref_inner_product_fwd_t) 
nullptr, }}, {{forward, f16, f16, f16}, { - CPU_INSTANCE_AMX(brgemm_inner_product_fwd_t) - CPU_INSTANCE_AVX512(brgemm_inner_product_fwd_t) - CPU_INSTANCE_AVX2(brgemm_inner_product_fwd_t) - CPU_INSTANCE_AARCH64_ACL(acl_inner_product_fwd_t) + //CPU_INSTANCE_X64(matmul_inner_product_fwd_t) + CPU_INSTANCE_AMX(brgemm_inner_product_fwd_t,avx512_core_amx_fp16) + CPU_INSTANCE_AVX512(brgemm_inner_product_fwd_t,avx512_core_fp16) + CPU_INSTANCE_AVX2(brgemm_inner_product_fwd_t,avx2_vnni_2) + CPU_INSTANCE_ACL(acl_inner_product_fwd_t) CPU_INSTANCE(ref_inner_product_fwd_t) nullptr, }}, @@ -89,170 +211,208 @@ const std::map> &impl_list_map() * in fp32 and weights are in bf16 */ {{forward, f32, bf16, f32}, { - CPU_INSTANCE_AARCH64_ACL(acl_inner_product_fwd_t) + CPU_INSTANCE_ACL(acl_inner_product_fwd_t) nullptr, }}, + + BRGEMM_FP8_FWD_IP(f8_e5m2, f8_e5m2, f16), + BRGEMM_FP8_FWD_IP(f8_e5m2, f8_e5m2, f32), + BRGEMM_FP8_FWD_IP(f8_e5m2, f8_e5m2, f8_e5m2), + BRGEMM_FP8_FWD_IP(f8_e5m2, f8_e5m2, f8_e4m3), + BRGEMM_FP8_FWD_IP(f8_e5m2, f8_e4m3, f16), + BRGEMM_FP8_FWD_IP(f8_e5m2, f8_e4m3, f32), + BRGEMM_FP8_FWD_IP(f8_e5m2, f8_e4m3, f8_e5m2), + BRGEMM_FP8_FWD_IP(f8_e5m2, f8_e4m3, f8_e4m3), + BRGEMM_FP8_FWD_IP(f8_e4m3, f8_e5m2, f16), + BRGEMM_FP8_FWD_IP(f8_e4m3, f8_e5m2, f32), + BRGEMM_FP8_FWD_IP(f8_e4m3, f8_e5m2, f8_e5m2), + BRGEMM_FP8_FWD_IP(f8_e4m3, f8_e5m2, f8_e4m3), + BRGEMM_FP8_FWD_IP(f8_e4m3, f8_e4m3, f16), + BRGEMM_FP8_FWD_IP(f8_e4m3, f8_e4m3, f32), + BRGEMM_FP8_FWD_IP(f8_e4m3, f8_e4m3, f8_e5m2), + BRGEMM_FP8_FWD_IP(f8_e4m3, f8_e4m3, f8_e4m3), + {{backward_data, f32, f32, f32}, REG_BWD_PK({ - CPU_INSTANCE_AMX(brgemm_inner_product_bwd_data_t) // bf32 - CPU_INSTANCE_AVX512(brgemm_inner_product_bwd_data_t) - CPU_INSTANCE_AVX2(brgemm_inner_product_bwd_data_t) - CPU_INSTANCE(gemm_inner_product_bwd_data_t) + CPU_INSTANCE_X64(matmul_inner_product_bwd_data_t) + CPU_INSTANCE_AMX(brgemm_inner_product_bwd_data_t,avx512_core_amx) // bf32 + CPU_INSTANCE_AVX512(brgemm_inner_product_bwd_data_t,avx512_core) + CPU_INSTANCE_AVX2(brgemm_inner_product_bwd_data_t,avx2) + CPU_INSTANCE(gemm_inner_product_bwd_data_t,f32) CPU_INSTANCE(ref_inner_product_bwd_data_t) nullptr, })}, {{backward_data, f32, bf16, bf16}, REG_BWD_PK({ - CPU_INSTANCE_AMX(brgemm_inner_product_bwd_data_t) - CPU_INSTANCE_AVX512(brgemm_inner_product_bwd_data_t) - CPU_INSTANCE_AVX512(gemm_bf16_inner_product_bwd_data_t) + CPU_INSTANCE_X64(matmul_inner_product_bwd_data_t) + CPU_INSTANCE_AMX(brgemm_inner_product_bwd_data_t,avx512_core_amx) + CPU_INSTANCE_AVX512(brgemm_inner_product_bwd_data_t,avx512_core_bf16) + CPU_INSTANCE_AVX512(gemm_bf16_inner_product_bwd_data_t,f32) CPU_INSTANCE(ref_inner_product_bwd_data_t) nullptr, })}, {{backward_data, bf16, bf16, bf16}, REG_BWD_PK({ - CPU_INSTANCE_AMX(brgemm_inner_product_bwd_data_t) - CPU_INSTANCE_AVX512(brgemm_inner_product_bwd_data_t) - CPU_INSTANCE_AVX512(gemm_bf16_inner_product_bwd_data_t) + CPU_INSTANCE_X64(matmul_inner_product_bwd_data_t) + CPU_INSTANCE_AMX(brgemm_inner_product_bwd_data_t,avx512_core_amx) + CPU_INSTANCE_AVX512(brgemm_inner_product_bwd_data_t,avx512_core_bf16) + CPU_INSTANCE_AVX512(gemm_bf16_inner_product_bwd_data_t,bf16) CPU_INSTANCE(ref_inner_product_bwd_data_t) nullptr, })}, {{backward_data, f32, f16, f16}, REG_BWD_PK({ - CPU_INSTANCE_AMX(brgemm_inner_product_bwd_data_t) - CPU_INSTANCE_AVX512(brgemm_inner_product_bwd_data_t) + CPU_INSTANCE_X64(matmul_inner_product_bwd_data_t) + CPU_INSTANCE_AMX(brgemm_inner_product_bwd_data_t,avx512_core_amx_fp16) + 
CPU_INSTANCE_AVX512(brgemm_inner_product_bwd_data_t,avx512_core_fp16) CPU_INSTANCE(ref_inner_product_bwd_data_t) nullptr, })}, {{backward_data, f16, f16, f16}, REG_BWD_PK({ - CPU_INSTANCE_AMX(brgemm_inner_product_bwd_data_t) - CPU_INSTANCE_AVX512(brgemm_inner_product_bwd_data_t) + CPU_INSTANCE_X64(matmul_inner_product_bwd_data_t) + CPU_INSTANCE_AMX(brgemm_inner_product_bwd_data_t,avx512_core_amx_fp16) + CPU_INSTANCE_AVX512(brgemm_inner_product_bwd_data_t,avx512_core_fp16) CPU_INSTANCE(ref_inner_product_bwd_data_t) nullptr, })}, {{backward_weights, f32, f32, f32}, REG_BWD_PK({ - CPU_INSTANCE_AMX(brgemm_inner_product_bwd_weights_t) // bf32 - CPU_INSTANCE_AVX512(brgemm_inner_product_bwd_weights_t) - CPU_INSTANCE_AVX2(brgemm_inner_product_bwd_weights_t) - CPU_INSTANCE(gemm_inner_product_bwd_weights_t) + CPU_INSTANCE_X64(matmul_inner_product_bwd_weights_t) + CPU_INSTANCE_AMX(brgemm_inner_product_bwd_weights_t,avx512_core_amx) // bf32 + CPU_INSTANCE_AVX512(brgemm_inner_product_bwd_weights_t,avx512_core) + CPU_INSTANCE_AVX2(brgemm_inner_product_bwd_weights_t,avx2) + CPU_INSTANCE(gemm_inner_product_bwd_weights_t,f32) CPU_INSTANCE(ref_inner_product_bwd_weights_t) nullptr, })}, {{backward_weights, bf16, f32, bf16}, REG_BWD_PK({ - CPU_INSTANCE_AMX(brgemm_inner_product_bwd_weights_t) - CPU_INSTANCE_AVX512(brgemm_inner_product_bwd_weights_t) - CPU_INSTANCE_AVX512(gemm_bf16_inner_product_bwd_weights_t) + CPU_INSTANCE_X64(matmul_inner_product_bwd_weights_t) + CPU_INSTANCE_AMX(brgemm_inner_product_bwd_weights_t,avx512_core_amx) + CPU_INSTANCE_AVX512(brgemm_inner_product_bwd_weights_t,avx512_core_bf16) + CPU_INSTANCE_AVX512(gemm_bf16_inner_product_bwd_weights_t,f32) CPU_INSTANCE(ref_inner_product_bwd_weights_t) nullptr, })}, {{backward_weights, bf16, bf16, bf16}, REG_BWD_PK({ - CPU_INSTANCE_AMX(brgemm_inner_product_bwd_weights_t) - CPU_INSTANCE_AVX512(brgemm_inner_product_bwd_weights_t) - CPU_INSTANCE_AVX512(gemm_bf16_inner_product_bwd_weights_t) + CPU_INSTANCE_X64(matmul_inner_product_bwd_weights_t) + CPU_INSTANCE_AMX(brgemm_inner_product_bwd_weights_t,avx512_core_amx) + CPU_INSTANCE_AVX512(brgemm_inner_product_bwd_weights_t,avx512_core_bf16) + CPU_INSTANCE_AVX512(gemm_bf16_inner_product_bwd_weights_t,bf16) CPU_INSTANCE(ref_inner_product_bwd_weights_t) nullptr, })}, {{backward_weights, f16, f32, f16}, REG_BWD_PK({ - CPU_INSTANCE_AMX(brgemm_inner_product_bwd_weights_t) - CPU_INSTANCE_AVX512(brgemm_inner_product_bwd_weights_t) + CPU_INSTANCE_X64(matmul_inner_product_bwd_weights_t) + CPU_INSTANCE_AMX(brgemm_inner_product_bwd_weights_t,avx512_core_amx_fp16) + CPU_INSTANCE_AVX512(brgemm_inner_product_bwd_weights_t,avx512_core_fp16) CPU_INSTANCE(ref_inner_product_bwd_weights_t) nullptr, })}, {{backward_weights, f16, f16, f16}, REG_BWD_PK({ - CPU_INSTANCE_AMX(brgemm_inner_product_bwd_weights_t) - CPU_INSTANCE_AVX512(brgemm_inner_product_bwd_weights_t) + CPU_INSTANCE_X64(matmul_inner_product_bwd_weights_t) + CPU_INSTANCE_AMX(brgemm_inner_product_bwd_weights_t,avx512_core_amx_fp16) + CPU_INSTANCE_AVX512(brgemm_inner_product_bwd_weights_t,avx512_core_fp16) CPU_INSTANCE(ref_inner_product_bwd_weights_t) nullptr, })}, {{forward, s8, s8, f32}, { - CPU_INSTANCE_AMX(brgemm_inner_product_fwd_t) - CPU_INSTANCE_AVX512(brgemm_inner_product_fwd_t) - CPU_INSTANCE_AVX512(brgemm_inner_product_fwd_t) - CPU_INSTANCE_AVX2(brgemm_inner_product_fwd_t) - CPU_INSTANCE_AVX2(brgemm_inner_product_fwd_t) + //CPU_INSTANCE_X64(matmul_inner_product_fwd_t) + CPU_INSTANCE_AMX(brgemm_inner_product_fwd_t,avx512_core_amx) + 
CPU_INSTANCE_AVX512(brgemm_inner_product_fwd_t,avx512_core_vnni) + CPU_INSTANCE_AVX512(brgemm_inner_product_fwd_t,avx512_core) + CPU_INSTANCE_AVX2(brgemm_inner_product_fwd_t,avx2_vnni_2) + CPU_INSTANCE_AVX2(brgemm_inner_product_fwd_t,avx2_vnni) CPU_INSTANCE(gemm_x8s8s32x_inner_product_fwd_t) CPU_INSTANCE(ref_inner_product_int8_fwd_t) nullptr, }}, {{forward, s8, s8, s32}, { - CPU_INSTANCE_AMX(brgemm_inner_product_fwd_t) - CPU_INSTANCE_AVX512(brgemm_inner_product_fwd_t) - CPU_INSTANCE_AVX512(brgemm_inner_product_fwd_t) - CPU_INSTANCE_AVX2(brgemm_inner_product_fwd_t) - CPU_INSTANCE_AVX2(brgemm_inner_product_fwd_t) + //CPU_INSTANCE_X64(matmul_inner_product_fwd_t) + CPU_INSTANCE_AMX(brgemm_inner_product_fwd_t,avx512_core_amx) + CPU_INSTANCE_AVX512(brgemm_inner_product_fwd_t,avx512_core_vnni) + CPU_INSTANCE_AVX512(brgemm_inner_product_fwd_t,avx512_core) + CPU_INSTANCE_AVX2(brgemm_inner_product_fwd_t,avx2_vnni_2) + CPU_INSTANCE_AVX2(brgemm_inner_product_fwd_t,avx2_vnni) CPU_INSTANCE(gemm_x8s8s32x_inner_product_fwd_t) CPU_INSTANCE(ref_inner_product_int8_fwd_t) nullptr, }}, {{forward, s8, s8, s8}, { - CPU_INSTANCE_AMX(brgemm_inner_product_fwd_t) - CPU_INSTANCE_AVX512(brgemm_inner_product_fwd_t) - CPU_INSTANCE_AVX512(brgemm_inner_product_fwd_t) - CPU_INSTANCE_AVX2(brgemm_inner_product_fwd_t) - CPU_INSTANCE_AVX2(brgemm_inner_product_fwd_t) + //CPU_INSTANCE_X64(matmul_inner_product_fwd_t) + CPU_INSTANCE_AMX(brgemm_inner_product_fwd_t,avx512_core_amx) + CPU_INSTANCE_AVX512(brgemm_inner_product_fwd_t,avx512_core_vnni) + CPU_INSTANCE_AVX512(brgemm_inner_product_fwd_t,avx512_core) + CPU_INSTANCE_AVX2(brgemm_inner_product_fwd_t,avx2_vnni_2) + CPU_INSTANCE_AVX2(brgemm_inner_product_fwd_t,avx2_vnni) CPU_INSTANCE(gemm_x8s8s32x_inner_product_fwd_t) CPU_INSTANCE(ref_inner_product_int8_fwd_t) nullptr, }}, {{forward, s8, s8, u8}, { - CPU_INSTANCE_AMX(brgemm_inner_product_fwd_t) - CPU_INSTANCE_AVX512(brgemm_inner_product_fwd_t) - CPU_INSTANCE_AVX512(brgemm_inner_product_fwd_t) - CPU_INSTANCE_AVX2(brgemm_inner_product_fwd_t) - CPU_INSTANCE_AVX2(brgemm_inner_product_fwd_t) + //CPU_INSTANCE_X64(matmul_inner_product_fwd_t) + CPU_INSTANCE_AMX(brgemm_inner_product_fwd_t,avx512_core_amx) + CPU_INSTANCE_AVX512(brgemm_inner_product_fwd_t,avx512_core_vnni) + CPU_INSTANCE_AVX512(brgemm_inner_product_fwd_t,avx512_core) + CPU_INSTANCE_AVX2(brgemm_inner_product_fwd_t,avx2_vnni_2) + CPU_INSTANCE_AVX2(brgemm_inner_product_fwd_t,avx2_vnni) CPU_INSTANCE(gemm_x8s8s32x_inner_product_fwd_t) CPU_INSTANCE(ref_inner_product_int8_fwd_t) nullptr, }}, {{forward, u8, s8, f32}, { - CPU_INSTANCE_AMX(brgemm_inner_product_fwd_t) - CPU_INSTANCE_AVX512(brgemm_inner_product_fwd_t) - CPU_INSTANCE_AVX512(brgemm_inner_product_fwd_t) - CPU_INSTANCE_AVX2(brgemm_inner_product_fwd_t) - CPU_INSTANCE_AVX2(brgemm_inner_product_fwd_t) + //CPU_INSTANCE_X64(matmul_inner_product_fwd_t) + CPU_INSTANCE_AMX(brgemm_inner_product_fwd_t,avx512_core_amx) + CPU_INSTANCE_AVX512(brgemm_inner_product_fwd_t,avx512_core_vnni) + CPU_INSTANCE_AVX512(brgemm_inner_product_fwd_t,avx512_core) + CPU_INSTANCE_AVX2(brgemm_inner_product_fwd_t,avx2_vnni_2) + CPU_INSTANCE_AVX2(brgemm_inner_product_fwd_t,avx2_vnni) CPU_INSTANCE(gemm_x8s8s32x_inner_product_fwd_t) CPU_INSTANCE(ref_inner_product_int8_fwd_t) nullptr, }}, {{forward, u8, s8, s32}, { - CPU_INSTANCE_AMX(brgemm_inner_product_fwd_t) - CPU_INSTANCE_AVX512(brgemm_inner_product_fwd_t) - CPU_INSTANCE_AVX512(brgemm_inner_product_fwd_t) - CPU_INSTANCE_AVX2(brgemm_inner_product_fwd_t) - CPU_INSTANCE_AVX2(brgemm_inner_product_fwd_t) + 
//CPU_INSTANCE_X64(matmul_inner_product_fwd_t) + CPU_INSTANCE_AMX(brgemm_inner_product_fwd_t,avx512_core_amx) + CPU_INSTANCE_AVX512(brgemm_inner_product_fwd_t,avx512_core_vnni) + CPU_INSTANCE_AVX512(brgemm_inner_product_fwd_t,avx512_core) + CPU_INSTANCE_AVX2(brgemm_inner_product_fwd_t,avx2_vnni_2) + CPU_INSTANCE_AVX2(brgemm_inner_product_fwd_t,avx2_vnni) CPU_INSTANCE(gemm_x8s8s32x_inner_product_fwd_t) CPU_INSTANCE(ref_inner_product_int8_fwd_t) nullptr, }}, {{forward, u8, s8, s8}, { - CPU_INSTANCE_AMX(brgemm_inner_product_fwd_t) - CPU_INSTANCE_AVX512(brgemm_inner_product_fwd_t) - CPU_INSTANCE_AVX512(brgemm_inner_product_fwd_t) - CPU_INSTANCE_AVX2(brgemm_inner_product_fwd_t) - CPU_INSTANCE_AVX2(brgemm_inner_product_fwd_t) + //CPU_INSTANCE_X64(matmul_inner_product_fwd_t) + CPU_INSTANCE_AMX(brgemm_inner_product_fwd_t,avx512_core_amx) + CPU_INSTANCE_AVX512(brgemm_inner_product_fwd_t,avx512_core_vnni) + CPU_INSTANCE_AVX512(brgemm_inner_product_fwd_t,avx512_core) + CPU_INSTANCE_AVX2(brgemm_inner_product_fwd_t,avx2_vnni_2) + CPU_INSTANCE_AVX2(brgemm_inner_product_fwd_t,avx2_vnni) CPU_INSTANCE(gemm_x8s8s32x_inner_product_fwd_t) CPU_INSTANCE(ref_inner_product_int8_fwd_t) nullptr, }}, {{forward, u8, s8, u8}, { - CPU_INSTANCE_AMX(brgemm_inner_product_fwd_t) - CPU_INSTANCE_AVX512(brgemm_inner_product_fwd_t) - CPU_INSTANCE_AVX512(brgemm_inner_product_fwd_t) - CPU_INSTANCE_AVX2(brgemm_inner_product_fwd_t) - CPU_INSTANCE_AVX2(brgemm_inner_product_fwd_t) + //CPU_INSTANCE_X64(matmul_inner_product_fwd_t) + CPU_INSTANCE_AMX(brgemm_inner_product_fwd_t,avx512_core_amx) + CPU_INSTANCE_AVX512(brgemm_inner_product_fwd_t,avx512_core_vnni) + CPU_INSTANCE_AVX512(brgemm_inner_product_fwd_t,avx512_core) + CPU_INSTANCE_AVX2(brgemm_inner_product_fwd_t,avx2_vnni_2) + CPU_INSTANCE_AVX2(brgemm_inner_product_fwd_t,avx2_vnni) CPU_INSTANCE(gemm_x8s8s32x_inner_product_fwd_t) CPU_INSTANCE(ref_inner_product_int8_fwd_t) nullptr, }}, {{forward, s8, s8, bf16}, { - CPU_INSTANCE_AMX(brgemm_inner_product_fwd_t) - CPU_INSTANCE_AVX512(brgemm_inner_product_fwd_t) - CPU_INSTANCE_AVX512(brgemm_inner_product_fwd_t) - CPU_INSTANCE_AVX2(brgemm_inner_product_fwd_t) + //CPU_INSTANCE_X64(matmul_inner_product_fwd_t) + CPU_INSTANCE_AMX(brgemm_inner_product_fwd_t,avx512_core_amx) + CPU_INSTANCE_AVX512(brgemm_inner_product_fwd_t,avx512_core_vnni) + CPU_INSTANCE_AVX512(brgemm_inner_product_fwd_t,avx512_core) + CPU_INSTANCE_AVX2(brgemm_inner_product_fwd_t,avx2_vnni_2) CPU_INSTANCE(ref_inner_product_int8_fwd_t) nullptr, }}, {{forward, u8, s8, bf16}, { - CPU_INSTANCE_AMX(brgemm_inner_product_fwd_t) - CPU_INSTANCE_AVX512(brgemm_inner_product_fwd_t) - CPU_INSTANCE_AVX512(brgemm_inner_product_fwd_t) - CPU_INSTANCE_AVX2(brgemm_inner_product_fwd_t) + //CPU_INSTANCE_X64(matmul_inner_product_fwd_t) + CPU_INSTANCE_AMX(brgemm_inner_product_fwd_t,avx512_core_amx) + CPU_INSTANCE_AVX512(brgemm_inner_product_fwd_t,avx512_core_vnni) + CPU_INSTANCE_AVX512(brgemm_inner_product_fwd_t,avx512_core) + CPU_INSTANCE_AVX2(brgemm_inner_product_fwd_t,avx2_vnni_2) CPU_INSTANCE(ref_inner_product_int8_fwd_t) nullptr, }}, diff --git a/src/cpu/cpu_inner_product_pd.hpp b/src/cpu/cpu_inner_product_pd.hpp index 7554af4a81f..0d6742d1a3b 100644 --- a/src/cpu/cpu_inner_product_pd.hpp +++ b/src/cpu/cpu_inner_product_pd.hpp @@ -193,8 +193,8 @@ struct cpu_inner_product_fwd_pd_t : public inner_product_fwd_pd_t { /* with batch = 1, no transpose to use the faster gemv kernels */ /* otherwise, we transpose the weights to improve efficiency of * no-copy kernels */ - if (MB() > 1 && 
transpose_leading_dim(OC(), IC_total())) - transpose_md(weights_md_); +// if (MB() > 1 && transpose_leading_dim(OC(), IC_total())) +// transpose_md(weights_md_); return status::success; }; diff --git a/src/cpu/cpu_layer_normalization_list.cpp b/src/cpu/cpu_layer_normalization_list.cpp index 222233bf74f..d3b33b6c27c 100644 --- a/src/cpu/cpu_layer_normalization_list.cpp +++ b/src/cpu/cpu_layer_normalization_list.cpp @@ -25,8 +25,9 @@ using namespace dnnl::impl::cpu::x64; #endif -#if DNNL_AARCH64_USE_ACL -#include "cpu/aarch64/acl_layer_normalization.hpp" +#if DNNL_USE_ACL +#include "cpu/acl/acl_layer_normalization.hpp" +using namespace dnnl::impl::cpu::acl; #endif namespace dnnl { @@ -37,16 +38,12 @@ namespace { using namespace dnnl::impl::data_type; using namespace dnnl::impl::prop_kind; -#if DNNL_AARCH64_USE_ACL -using namespace dnnl::impl::cpu::aarch64; -#endif - // clang-format off const std::map> &impl_list_map() { static const std::map> the_map = REG_LNORM_P({ {{forward}, { CPU_INSTANCE_X64(jit_uni_layer_normalization_fwd_t) - CPU_INSTANCE_AARCH64_ACL(acl_layer_normalization_fwd_t) + CPU_INSTANCE_ACL(acl_layer_normalization_fwd_t) CPU_INSTANCE(simple_layer_normalization_fwd_t) CPU_INSTANCE(ref_layer_normalization_fwd_t) nullptr, diff --git a/src/cpu/cpu_lrn_list.cpp b/src/cpu/cpu_lrn_list.cpp index 4f369af72b7..74b12dec11c 100644 --- a/src/cpu/cpu_lrn_list.cpp +++ b/src/cpu/cpu_lrn_list.cpp @@ -36,32 +36,25 @@ using namespace dnnl::impl::prop_kind; const std::map> &impl_list_map() { static const std::map> the_map = REG_LRN_P({ {{forward}, { - CPU_INSTANCE_X64(jit_avx512_common_lrn_fwd_t) - CPU_INSTANCE_X64(jit_avx512_common_lrn_fwd_t) - CPU_INSTANCE_X64(jit_avx512_common_lrn_fwd_t) - CPU_INSTANCE_X64(jit_uni_lrn_fwd_t) - CPU_INSTANCE_X64(jit_uni_lrn_fwd_t) - CPU_INSTANCE_X64(jit_uni_lrn_fwd_t) - CPU_INSTANCE_X64(jit_uni_lrn_fwd_t) - CPU_INSTANCE_X64(jit_uni_lrn_fwd_t) - CPU_INSTANCE_X64(jit_uni_lrn_fwd_t) - CPU_INSTANCE_X64(jit_uni_lrn_fwd_t) - CPU_INSTANCE(ref_lrn_fwd_t) - CPU_INSTANCE(ref_lrn_fwd_t) - CPU_INSTANCE(ref_lrn_fwd_t) + CPU_INSTANCE_X64(jit_avx512_common_lrn_fwd_t, f32) + CPU_INSTANCE_X64(jit_avx512_common_lrn_fwd_t, bf16) + CPU_INSTANCE_X64(jit_uni_lrn_fwd_t, avx512_core, f32) + CPU_INSTANCE_X64(jit_uni_lrn_fwd_t, avx512_core, bf16) + CPU_INSTANCE_X64(jit_uni_lrn_fwd_t, avx2_vnni_2, bf16) + CPU_INSTANCE_X64(jit_uni_lrn_fwd_t, avx2, f32) + CPU_INSTANCE_X64(jit_uni_lrn_fwd_t, sse41, f32) + CPU_INSTANCE(ref_lrn_fwd_t, f32) + CPU_INSTANCE(ref_lrn_fwd_t, bf16) nullptr, }}, {{backward}, REG_BWD_PK({ - CPU_INSTANCE_X64(jit_avx512_common_lrn_bwd_t) - CPU_INSTANCE_X64(jit_avx512_common_lrn_bwd_t) - CPU_INSTANCE_X64(jit_avx512_common_lrn_bwd_t) - CPU_INSTANCE_X64(jit_uni_lrn_bwd_t) - CPU_INSTANCE_X64(jit_uni_lrn_bwd_t) - CPU_INSTANCE_X64(jit_uni_lrn_bwd_t) - CPU_INSTANCE_X64(jit_uni_lrn_bwd_t) - CPU_INSTANCE(ref_lrn_bwd_t) - CPU_INSTANCE(ref_lrn_bwd_t) - CPU_INSTANCE(ref_lrn_bwd_t) + CPU_INSTANCE_X64(jit_avx512_common_lrn_bwd_t, f32) + CPU_INSTANCE_X64(jit_avx512_common_lrn_bwd_t, bf16) + CPU_INSTANCE_X64(jit_uni_lrn_bwd_t, avx512_core, f32) + CPU_INSTANCE_X64(jit_uni_lrn_bwd_t, avx512_core, bf16) + CPU_INSTANCE_X64(jit_uni_lrn_bwd_t, avx2, f32) + CPU_INSTANCE(ref_lrn_bwd_t, f32) + CPU_INSTANCE(ref_lrn_bwd_t, bf16) nullptr, })}, }); diff --git a/src/cpu/cpu_pooling_list.cpp b/src/cpu/cpu_pooling_list.cpp index 951395c44bc..20e060c7e3c 100644 --- a/src/cpu/cpu_pooling_list.cpp +++ b/src/cpu/cpu_pooling_list.cpp @@ -30,15 +30,16 @@ using namespace dnnl::impl::cpu::x64; #include 
"cpu/aarch64/jit_uni_i8i8_pooling.hpp" #include "cpu/aarch64/jit_uni_pooling.hpp" using namespace dnnl::impl::cpu::aarch64; -#if DNNL_AARCH64_USE_ACL -#include "cpu/aarch64/acl_pooling.hpp" -#endif // DNNL_AARCH64_USE_ACL #elif DNNL_RV64 #if DNNL_RISCV_USE_RVV_INTRINSICS #include "cpu/rv64/rvv_nchw_pooling.hpp" using namespace dnnl::impl::cpu::rv64; #endif // DNNL_RISCV_USE_RVV_INTRINSICS #endif +#if DNNL_USE_ACL +#include "cpu/acl/acl_pooling.hpp" +using namespace dnnl::impl::cpu::acl; +#endif namespace dnnl { namespace impl { @@ -53,60 +54,62 @@ const std::map> &impl_list_map() { static const std::map> the_map = REG_POOLING_P({ {{forward}, { /* fp */ - CPU_INSTANCE_X64(jit_uni_pooling_fwd_t) - CPU_INSTANCE_X64(jit_uni_pooling_fwd_t) - CPU_INSTANCE_X64(jit_uni_pooling_fwd_t) - CPU_INSTANCE_X64(jit_uni_pooling_fwd_t) - CPU_INSTANCE_X64(jit_uni_pooling_fwd_t) - CPU_INSTANCE_X64(jit_uni_pooling_fwd_t) - CPU_INSTANCE_X64(jit_uni_pooling_fwd_t) - CPU_INSTANCE_X64(jit_uni_pooling_fwd_t) - CPU_INSTANCE_X64(jit_uni_pooling_fwd_t) - CPU_INSTANCE_X64(jit_uni_pooling_fwd_t) - CPU_INSTANCE_AARCH64(jit_uni_pooling_fwd_t) - CPU_INSTANCE_AARCH64(jit_uni_pooling_fwd_t) - CPU_INSTANCE_AARCH64_ACL(acl_pooling_fwd_t) + CPU_INSTANCE_X64(jit_uni_pooling_fwd_t, avx512_core_fp16, f16) + CPU_INSTANCE_X64(jit_uni_pooling_fwd_t, avx512_core_fp16, f8_e5m2) + CPU_INSTANCE_X64(jit_uni_pooling_fwd_t, avx512_core_fp16, f8_e4m3) + CPU_INSTANCE_X64(jit_uni_pooling_fwd_t, avx512_core, bf16) + CPU_INSTANCE_X64(jit_uni_pooling_fwd_t, avx512_core, f32) + CPU_INSTANCE_X64(jit_uni_pooling_fwd_t, avx2_vnni_2, bf16) + CPU_INSTANCE_X64(jit_uni_pooling_fwd_t, avx2_vnni_2, f16) + CPU_INSTANCE_X64(jit_uni_pooling_fwd_t, avx2, f32) + CPU_INSTANCE_X64(jit_uni_pooling_fwd_t, avx, f32) + CPU_INSTANCE_X64(jit_uni_pooling_fwd_t, sse41, f32) + CPU_INSTANCE_AARCH64(jit_uni_pooling_fwd_t, sve_512, f32) + CPU_INSTANCE_AARCH64(jit_uni_pooling_fwd_t, sve_256, f32) + CPU_INSTANCE_ACL(acl_pooling_fwd_t) CPU_INSTANCE_RV64GCV(riscv_nchw_pooling_fwd_t) - CPU_INSTANCE(nchw_pooling_fwd_t) - CPU_INSTANCE(nchw_pooling_fwd_t) - CPU_INSTANCE(nchw_pooling_fwd_t) - CPU_INSTANCE(nchw_pooling_fwd_t) - CPU_INSTANCE(nchw_pooling_fwd_t) - CPU_INSTANCE(nhwc_pooling_fwd_t) - CPU_INSTANCE(nhwc_pooling_fwd_t) - CPU_INSTANCE(nhwc_pooling_fwd_t) - CPU_INSTANCE(nhwc_pooling_fwd_t) - CPU_INSTANCE(nhwc_pooling_fwd_t) - CPU_INSTANCE(ref_pooling_fwd_t) - CPU_INSTANCE(ref_pooling_fwd_t) - CPU_INSTANCE(ref_pooling_fwd_t) - CPU_INSTANCE(ref_pooling_fwd_t) - CPU_INSTANCE(ref_pooling_fwd_t) + CPU_INSTANCE(nchw_pooling_fwd_t, bf16) + CPU_INSTANCE(nchw_pooling_fwd_t, f32) + CPU_INSTANCE(nchw_pooling_fwd_t, f16) + CPU_INSTANCE(nchw_pooling_fwd_t, f8_e5m2) + CPU_INSTANCE(nchw_pooling_fwd_t, f8_e4m3) + CPU_INSTANCE(nhwc_pooling_fwd_t, bf16) + CPU_INSTANCE(nhwc_pooling_fwd_t, f32) + CPU_INSTANCE(nhwc_pooling_fwd_t, f16) + CPU_INSTANCE(nhwc_pooling_fwd_t, f8_e5m2) + CPU_INSTANCE(nhwc_pooling_fwd_t, f8_e4m3) + CPU_INSTANCE(ref_pooling_fwd_t, f32, f32, f32) + CPU_INSTANCE(ref_pooling_fwd_t, bf16, bf16, f32) + CPU_INSTANCE(ref_pooling_fwd_t, f16, f16, f32) + CPU_INSTANCE(ref_pooling_fwd_t, f8_e5m2, f8_e5m2, f32) + CPU_INSTANCE(ref_pooling_fwd_t, f8_e4m3, f8_e4m3, f32) /* int */ - CPU_INSTANCE_X64(jit_uni_i8i8_pooling_fwd_t) - CPU_INSTANCE_X64(jit_uni_i8i8_pooling_fwd_t) - CPU_INSTANCE_X64(jit_uni_i8i8_pooling_fwd_t) - CPU_INSTANCE_AARCH64(jit_uni_i8i8_pooling_fwd_t) - CPU_INSTANCE(ref_pooling_fwd_t) - CPU_INSTANCE(ref_pooling_fwd_t) - CPU_INSTANCE(ref_pooling_fwd_t) + 
CPU_INSTANCE_X64(jit_uni_i8i8_pooling_fwd_t, avx512_core) + CPU_INSTANCE_X64(jit_uni_i8i8_pooling_fwd_t, avx2) + CPU_INSTANCE_X64(jit_uni_i8i8_pooling_fwd_t, sse41) + CPU_INSTANCE_AARCH64(jit_uni_i8i8_pooling_fwd_t, sve_512) + CPU_INSTANCE(ref_pooling_fwd_t, s32, s32, s32) + CPU_INSTANCE(ref_pooling_fwd_t, s8, s8, s32) + CPU_INSTANCE(ref_pooling_fwd_t, s8, f32, f32) + CPU_INSTANCE(ref_pooling_fwd_t, u8, u8, s32) + CPU_INSTANCE(ref_pooling_fwd_t, u8, f32, f32) nullptr, }}, {{backward}, REG_BWD_PK({ - CPU_INSTANCE_X64(jit_uni_pooling_bwd_t) - CPU_INSTANCE_X64(jit_uni_pooling_bwd_t) - CPU_INSTANCE_X64(jit_uni_pooling_bwd_t) - CPU_INSTANCE_X64(jit_uni_pooling_bwd_t) - CPU_INSTANCE_X64(jit_uni_pooling_bwd_t) - CPU_INSTANCE_X64(jit_uni_pooling_bwd_t) - CPU_INSTANCE_AARCH64(jit_uni_pooling_bwd_t) - CPU_INSTANCE_AARCH64(jit_uni_pooling_bwd_t) - CPU_INSTANCE(nchw_pooling_bwd_t) - CPU_INSTANCE(nchw_pooling_bwd_t) - CPU_INSTANCE(nchw_pooling_bwd_t) - CPU_INSTANCE(nhwc_pooling_bwd_t) - CPU_INSTANCE(nhwc_pooling_bwd_t) - CPU_INSTANCE(nhwc_pooling_bwd_t) + CPU_INSTANCE_X64(jit_uni_pooling_bwd_t, avx512_core_fp16, f16) + CPU_INSTANCE_X64(jit_uni_pooling_bwd_t, avx512_core, bf16) + CPU_INSTANCE_X64(jit_uni_pooling_bwd_t, avx512_core, f32) + CPU_INSTANCE_X64(jit_uni_pooling_bwd_t, avx2, f32) + CPU_INSTANCE_X64(jit_uni_pooling_bwd_t, avx, f32) + CPU_INSTANCE_X64(jit_uni_pooling_bwd_t, sse41, f32) + CPU_INSTANCE_AARCH64(jit_uni_pooling_bwd_t, sve_512, f32) + CPU_INSTANCE_AARCH64(jit_uni_pooling_bwd_t, sve_256, f32) + CPU_INSTANCE(nchw_pooling_bwd_t, bf16) + CPU_INSTANCE(nchw_pooling_bwd_t, f32) + CPU_INSTANCE(nchw_pooling_bwd_t, f16) + CPU_INSTANCE(nhwc_pooling_bwd_t, bf16) + CPU_INSTANCE(nhwc_pooling_bwd_t, f32) + CPU_INSTANCE(nhwc_pooling_bwd_t, f16) CPU_INSTANCE(ref_pooling_bwd_t) nullptr, })}, diff --git a/src/cpu/cpu_prelu_list.cpp b/src/cpu/cpu_prelu_list.cpp index 883c356b18e..c7ff78c3424 100644 --- a/src/cpu/cpu_prelu_list.cpp +++ b/src/cpu/cpu_prelu_list.cpp @@ -23,9 +23,9 @@ #include "cpu/x64/prelu/jit_prelu_forward.hpp" using namespace dnnl::impl::cpu::x64; -#elif DNNL_AARCH64 && DNNL_AARCH64_USE_ACL -#include "cpu/aarch64/acl_prelu.hpp" -using namespace dnnl::impl::cpu::aarch64; +#elif DNNL_USE_ACL +#include "cpu/acl/acl_prelu.hpp" +using namespace dnnl::impl::cpu::acl; #endif namespace dnnl { @@ -41,7 +41,7 @@ const std::map> &impl_list_map() { static const std::map> the_map = REG_PRELU_P({ {{forward}, { CPU_INSTANCE_X64(jit_prelu_fwd_t) - CPU_INSTANCE_AARCH64_ACL(acl_prelu_fwd_t) + CPU_INSTANCE_ACL(acl_prelu_fwd_t) CPU_INSTANCE(ref_prelu_fwd_t) nullptr, }}, diff --git a/src/cpu/cpu_primitive.hpp b/src/cpu/cpu_primitive.hpp index ff315b8bedf..ff531fd2705 100644 --- a/src/cpu/cpu_primitive.hpp +++ b/src/cpu/cpu_primitive.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2024 Intel Corporation +* Copyright 2019-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
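A note on the registration pattern running through these lists: each CPU_INSTANCE_* entry now spells out its ISA and data-type template arguments explicitly, and each map value is an ordered candidate list keyed by the data-type combination. The following is a minimal, self-contained sketch of how such a typed table could be consulted at dispatch time; the names key_t, impl_item, and try_create are illustrative stand-ins, not the library's actual impl_list_item_t machinery.

#include <map>
#include <tuple>
#include <vector>

enum class prop { forward, backward };
enum class dt { f32, bf16, f16, s8, u8 };

// One candidate implementation; try_create() stands in for pd creation,
// which fails when the shapes, ISA, or attributes are unsupported.
struct impl_item {
    bool (*try_create)();
};

// The key mirrors the {prop_kind, src, wei, dst} tuples used in the tables above.
using key_t = std::tuple<prop, dt, dt, dt>;

bool dispatch(const std::map<key_t, std::vector<impl_item>> &table,
        const key_t &key) {
    const auto it = table.find(key);
    if (it == table.end()) return false; // combination not registered
    // Entries are ordered from most to least specialized; the first one that
    // creates successfully wins, with the reference impl as the fallback.
    for (const auto &impl : it->second)
        if (impl.try_create && impl.try_create()) return true;
    return false;
}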
@@ -29,59 +29,33 @@ #include "cpu/ref_io_helper.hpp" -#define DEFINE_SCALES_BUFFER_ATTR_ARG(attr, scales, arg) \ - alignas(16) float CONCAT2(scales, _buf16)[16] = {0}; \ - const float *scales {nullptr}; \ - if ((attr)) { \ - if ((attr)->output_scales_.has_default_values()) { \ - utils::array_set(CONCAT2(scales, _buf16), 1.0f, 16); \ - scales = CONCAT2(scales, _buf16); \ - } else { \ - scales = CTX_IN_MEM(const float *, arg); \ - VCHECK_ATTR(scales != nullptr, \ - "Scales buffer for arg %d is missing", arg); \ - const auto scales_d = ctx.memory_mdw(arg); \ - VCHECK_ATTR(scales_d.data_type() == data_type::f32, \ - "Scales data type is not f32"); \ - VCHECK_ATTR(scales_d.ndims() == 1, "Scales ndims is not 1"); \ - if (scales_d.dims()[0] == 1) { \ - utils::array_set(CONCAT2(scales, _buf16), scales[0], 16); \ - scales = CONCAT2(scales, _buf16); \ - } \ - } \ - } \ - MAYBE_UNUSED(scales); - -#define DEFINE_SCALES_BUFFER_ATTR(attr, scales) \ - DEFINE_SCALES_BUFFER_ATTR_ARG(attr, scales, DNNL_ARG_ATTR_OUTPUT_SCALES); - -#define DEFINE_SCALES_BUFFER(scales) \ - DEFINE_SCALES_BUFFER_ATTR(pd()->attr(), scales) - +//NOLINTBEGIN(bugprone-macro-parentheses) +// These macros are actual pieces of code, can't put certain pieces into `()`. +// TODO: consider making them functions. #define DEFINE_ARG_SCALES_BUFFER_ATTR(attr, scales, arg) \ alignas(16) float CONCAT2(scales, _buf16)[16] = {0}; \ const float *scales {nullptr}; \ if ((attr)) { \ - if ((attr)->scales_.get(arg).has_default_values()) { \ + if ((attr)->scales_.has_default_values(arg)) { \ utils::array_set(CONCAT2(scales, _buf16), 1.0f, 16); \ scales = CONCAT2(scales, _buf16); \ } else { \ - scales = CTX_IN_MEM(const float *, DNNL_ARG_ATTR_SCALES | arg); \ + scales = CTX_IN_MEM(const float *, DNNL_ARG_ATTR_SCALES | (arg)); \ VCHECK_ATTR(scales != nullptr, \ - "Scales buffer for arg %d is missing", arg); \ - const auto scales_d = ctx.memory_mdw(DNNL_ARG_ATTR_SCALES | arg); \ + "Scales buffer for arg %d is missing", (arg)); \ + const auto scales_d \ + = ctx.memory_mdw(DNNL_ARG_ATTR_SCALES | (arg)); \ VCHECK_ATTR( \ - utils::one_of(scales_d.data_type(), data_type::f32, \ - data_type::f16, data_type::bf16, data_type::e8m0), \ + utils::one_of(scales_d.data_type(), data_type::f32, data_type::e8m0) \ + && (scales_d.ndims() == 1 || scales_d.ndims() == 2), \ "Unsupported scales data type"); \ - if (scales_d.nelems() == 1) { \ - const float s = cpu::io::load_float_value( \ - scales_d.data_type(), scales, 0); \ - if (utils::one_of(arg, DNNL_ARG_DST, \ + if (scales_d.dims()[0] == 1) { \ + if (utils::one_of((arg), DNNL_ARG_DST, \ DNNL_ARG_ATTR_POST_OP_DW | DNNL_ARG_DST)) { \ - utils::array_set(CONCAT2(scales, _buf16), 1.f / s, 16); \ + utils::array_set( \ + CONCAT2(scales, _buf16), 1.f / scales[0], 16); \ } else { \ - utils::array_set(CONCAT2(scales, _buf16), s, 16); \ + utils::array_set(CONCAT2(scales, _buf16), scales[0], 16); \ } \ scales = CONCAT2(scales, _buf16); \ } \ @@ -90,24 +64,83 @@ MAYBE_UNUSED(scales); #define DEFINE_ARG_SCALES_BUFFER(scales, arg) \ - DEFINE_ARG_SCALES_BUFFER_ATTR(pd()->attr(), scales, arg) - -#define DEFINE_ZERO_POINTS_BUFFER(zero_points_ptr, mem_arg) \ - int32_t CONCAT2(default_zero_point_, mem_arg) = 0; \ - const int32_t *zero_points_ptr \ - = pd()->attr()->zero_points_.defined(mem_arg) \ - ? 
&CONCAT2(default_zero_point_, mem_arg) \ - : CTX_IN_MEM( \ - const int32_t *, DNNL_ARG_ATTR_ZERO_POINTS | mem_arg); \ - VCHECK_ATTR(zero_points_ptr != nullptr, \ - "Zero points buffer for arg %d is missing", mem_arg); \ + DEFINE_ARG_SCALES_BUFFER_ATTR(pd()->attr(), scales, (arg)) + +#define DEFINE_ZERO_POINTS_BUFFER_ATTR_U8(attr, zero_points_ptr, arg) \ + uint8_t CONCAT2(default_zero_point_, arg) = 0; \ + const uint8_t *zero_points_ptr {nullptr}; \ + if ((attr)) { \ + if ((attr)->zero_points_.has_default_values(arg)) { \ + zero_points_ptr = &CONCAT2(default_zero_point_, arg); \ + } else { \ + /* CAVEAT: type should be void to force proper loads of zero-points. + * Accessing `zero_points_ptr` by index will lead to a crash for + * datatypes different from s32. */ \ + zero_points_ptr = CTX_IN_MEM( \ + const uint8_t *, DNNL_ARG_ATTR_ZERO_POINTS | (arg)); \ + VCHECK_ATTR(zero_points_ptr != nullptr, \ + "Zero points buffer for arg %d is missing", (arg)); \ + const auto zero_points_d \ + = ctx.memory_mdw(DNNL_ARG_ATTR_ZERO_POINTS | (arg)); \ + VCHECK_ATTR(utils::one_of(zero_points_d.data_type(), \ + data_type::s32, data_type::s8, data_type::u8, \ + data_type::s4, data_type::u4, data_type::f32), \ + VERBOSE_INVALID_DATATYPE, "zero points"); \ + } \ + } \ + MAYBE_UNUSED(zero_points_ptr); + +#define DEFINE_ZERO_POINTS_BUFFER_ATTR(attr, zero_points_ptr, arg) \ + int32_t CONCAT2(default_zero_point_, arg) = 0; \ + const int32_t *zero_points_ptr {nullptr}; \ + if ((attr)) { \ + if ((attr)->zero_points_.has_default_values(arg)) { \ + zero_points_ptr = &CONCAT2(default_zero_point_, arg); \ + } else { \ + /* CAVEAT: type should be void to force proper loads of zero-points. + * Accessing `zero_points_ptr` by index will lead to a crash for + * datatypes different from s32. 
*/ \ + zero_points_ptr = CTX_IN_MEM( \ + const int32_t *, DNNL_ARG_ATTR_ZERO_POINTS | (arg)); \ + VCHECK_ATTR(zero_points_ptr != nullptr, \ + "Zero points buffer for arg %d is missing", (arg)); \ + const auto zero_points_d \ + = ctx.memory_mdw(DNNL_ARG_ATTR_ZERO_POINTS | (arg)); \ + VCHECK_ATTR(utils::one_of(zero_points_d.data_type(), \ + data_type::s32, data_type::s8, data_type::u8, \ + data_type::s4, data_type::u4), \ + VERBOSE_INVALID_DATATYPE, "zero points"); \ + } \ + } \ MAYBE_UNUSED(zero_points_ptr); +#define DEFINE_ZERO_POINTS_BUFFER(zero_points_ptr, arg) \ + DEFINE_ZERO_POINTS_BUFFER_ATTR(pd()->attr(), zero_points_ptr, arg) + #define ASSIGN_ARG_SCALE_VALUE(scale, mem_arg) \ alignas(16) float CONCAT2(CONCAT2(scales, _buf16), mem_arg)[16] = {0}; \ - if (pd()->attr()->scales_.get(mem_arg).has_default_values()) { \ + if (pd()->attr()->scales_.has_default_values(mem_arg)) { \ utils::array_set(CONCAT2(CONCAT2(scales, _buf16), mem_arg), 1.0f, 16); \ scale = CONCAT2(CONCAT2(scales, _buf16), mem_arg); \ + } + +#define DEFINE_INPUT_ZERO_POINTS_BUFFER(input_zero_points_ptr, jcp) \ + const uint8_t *input_zero_points_ptr = nullptr; \ + if (jcp.with_input_zp) { \ + input_zero_points_ptr = CTX_IN_MEM(const uint8_t *, DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_SRC); \ + if (input_zero_points_ptr == nullptr) return status::invalid_arguments; \ + } + +#define DEFINE_OUTPUT_COMPENSATION_BUFFER(output_compensation_ptr, jcp) \ + const int32_t *output_compensation_ptr = nullptr; \ + if (jcp.with_input_zp) { \ + output_compensation_ptr = CTX_IN_MEM(const int32_t *, DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_DST); \ + if (output_compensation_ptr == nullptr) return status::invalid_arguments; \ + } + +#define ASSIGN_INPUT_SCALE_VALUE(scale, mem_arg) \ + if (pd()->attr()->scales_.get(mem_arg).defined()) { \ + scale = pd()->attr()->scales_.get(mem_arg).scales_; \ } else { \ const auto scale_d = ctx.memory_mdw(DNNL_ARG_ATTR_SCALES | mem_arg); \ VCHECK_ATTR(scale_d.data_type() == data_type::f32, \ @@ -145,4 +178,6 @@ #define DEFINE_ZERO_POINT_VALUE(zero_point, mem_arg) \ DEFINE_ZERO_POINT_VALUE_ATTR(pd()->attr(), zero_point, mem_arg) +//NOLINTEND(bugprone-macro-parentheses) + #endif // CPU_CPU_PRIMITIVE_HPP diff --git a/src/cpu/cpu_reduction_list.cpp b/src/cpu/cpu_reduction_list.cpp index 6dde9e1d93f..86465bafc1d 100644 --- a/src/cpu/cpu_reduction_list.cpp +++ b/src/cpu/cpu_reduction_list.cpp @@ -31,20 +31,18 @@ namespace { using namespace dnnl::impl::data_type; // clang-format off -constexpr impl_list_item_t impl_list[] = REG_REDUCTION_P({ +const impl_list_item_t impl_list[] = REG_REDUCTION_P({ CPU_INSTANCE_X64(jit_uni_reduction_t) - CPU_INSTANCE(ref_reduction_t) - CPU_INSTANCE(ref_reduction_t) - CPU_INSTANCE(ref_reduction_t) - CPU_INSTANCE(ref_reduction_t) - CPU_INSTANCE(ref_reduction_t) - CPU_INSTANCE(ref_reduction_t) - CPU_INSTANCE(ref_reduction_t) - CPU_INSTANCE(ref_reduction_t) - CPU_INSTANCE(ref_reduction_t) - CPU_INSTANCE(ref_reduction_t) - CPU_INSTANCE(ref_reduction_t) + CPU_INSTANCE(ref_reduction_t, f32, f32, f32) + CPU_INSTANCE(ref_reduction_t, bf16, bf16, f32) + CPU_INSTANCE(ref_reduction_t, bf16, f32, f32) + CPU_INSTANCE(ref_reduction_t, s8, s8, s32) + CPU_INSTANCE(ref_reduction_t, s8, s32, s32) + CPU_INSTANCE(ref_reduction_t, s8, f32, s32) + CPU_INSTANCE(ref_reduction_t, u8, u8, s32) + CPU_INSTANCE(ref_reduction_t, u8, s32, s32) + CPU_INSTANCE(ref_reduction_t, u8, f32, s32) /* eol */ nullptr, }); diff --git a/src/cpu/cpu_shuffle_list.cpp b/src/cpu/cpu_shuffle_list.cpp index e81a19e89c5..cb4681415ba 100644 
--- a/src/cpu/cpu_shuffle_list.cpp +++ b/src/cpu/cpu_shuffle_list.cpp @@ -36,14 +36,14 @@ namespace { using namespace dnnl::impl::data_type; // clang-format off -constexpr impl_list_item_t impl_list[] = REG_SHUFFLE_P({ - CPU_INSTANCE_X64(jit_uni_shuffle_t) - CPU_INSTANCE_X64(jit_uni_shuffle_t) - CPU_INSTANCE_X64(jit_uni_shuffle_t) - CPU_INSTANCE_AARCH64(jit_uni_shuffle_t) - CPU_INSTANCE_AARCH64(jit_uni_shuffle_t) - CPU_INSTANCE_AARCH64(jit_uni_shuffle_t) - CPU_INSTANCE_AARCH64(jit_uni_shuffle_t) +const impl_list_item_t impl_list[] = REG_SHUFFLE_P({ + CPU_INSTANCE_X64(jit_uni_shuffle_t, avx512_core) + CPU_INSTANCE_X64(jit_uni_shuffle_t, avx) + CPU_INSTANCE_X64(jit_uni_shuffle_t, sse41) + CPU_INSTANCE_AARCH64(jit_uni_shuffle_t, sve_512) + CPU_INSTANCE_AARCH64(jit_uni_shuffle_t, sve_256) + CPU_INSTANCE_AARCH64(jit_uni_shuffle_t, sve_128) + CPU_INSTANCE_AARCH64(jit_uni_shuffle_t, asimd) CPU_INSTANCE(ref_shuffle_t) /* eol */ nullptr, diff --git a/src/cpu/cpu_softmax_list.cpp b/src/cpu/cpu_softmax_list.cpp index 5168f0708a1..20017f388a7 100644 --- a/src/cpu/cpu_softmax_list.cpp +++ b/src/cpu/cpu_softmax_list.cpp @@ -22,14 +22,16 @@ #if DNNL_X64 #include "cpu/x64/jit_uni_softmax.hpp" +#include "cpu/x64/jit_uni_fork_softmax.hpp" using namespace dnnl::impl::cpu::x64; #elif DNNL_AARCH64 #include "cpu/aarch64/jit_uni_softmax.hpp" -#if DNNL_AARCH64_USE_ACL -#include "cpu/aarch64/acl_softmax.hpp" -#endif using namespace dnnl::impl::cpu::aarch64; #endif +#if DNNL_USE_ACL +#include "cpu/acl/acl_softmax.hpp" +using namespace dnnl::impl::cpu::acl; +#endif namespace dnnl { namespace impl { @@ -44,18 +46,21 @@ const std::map> &impl_list_map() { static std::map> the_map = REG_SOFTMAX_P({ {{forward}, { CPU_INSTANCE_X64(jit_uni_softmax_fwd_t) - CPU_INSTANCE_AARCH64(jit_uni_softmax_fwd_t) - CPU_INSTANCE_AARCH64(jit_uni_softmax_fwd_t) - CPU_INSTANCE_AARCH64(jit_uni_softmax_fwd_t) - CPU_INSTANCE_AARCH64_ACL(acl_softmax_fwd_t) + CPU_INSTANCE_X64(jit_uni_fork_softmax_fwd_t, avx512_core) + CPU_INSTANCE_X64(jit_uni_fork_softmax_fwd_t, avx2) + CPU_INSTANCE_X64(jit_uni_fork_softmax_fwd_t, sse41) + CPU_INSTANCE_AARCH64(jit_uni_softmax_fwd_t, sve_512) + CPU_INSTANCE_AARCH64(jit_uni_softmax_fwd_t, sve_256) + CPU_INSTANCE_AARCH64(jit_uni_softmax_fwd_t, sve_128) + CPU_INSTANCE_ACL(acl_softmax_fwd_t) CPU_INSTANCE(ref_softmax_fwd_t) nullptr, }}, {{backward}, REG_BWD_PK({ CPU_INSTANCE_X64(jit_uni_softmax_bwd_t) - CPU_INSTANCE_AARCH64(jit_uni_softmax_bwd_t) - CPU_INSTANCE_AARCH64(jit_uni_softmax_bwd_t) - CPU_INSTANCE_AARCH64(jit_uni_softmax_bwd_t) + CPU_INSTANCE_AARCH64(jit_uni_softmax_bwd_t, sve_512) + CPU_INSTANCE_AARCH64(jit_uni_softmax_bwd_t, sve_256) + CPU_INSTANCE_AARCH64(jit_uni_softmax_bwd_t, sve_128) CPU_INSTANCE(ref_softmax_bwd_t) nullptr, })}, diff --git a/src/cpu/cpu_stream.hpp b/src/cpu/cpu_stream.hpp index 30d5a6e058b..7bf2cac3a44 100644 --- a/src/cpu/cpu_stream.hpp +++ b/src/cpu/cpu_stream.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2024 Intel Corporation +* Copyright 2019-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
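The INSTANCE macro in the cpu_sum.cpp hunk above wraps each concrete pd type in sum_type_deduction_helper_t so that differently parameterized kernels can live in a single flat array. A rough analogue of that type-erasure idiom, with every name (jit_sum, item, make_item) invented for illustration rather than taken from the library:

#include <vector>

// Stand-in for a concrete, template-parameterized kernel pd.
template <int acc_dt>
struct jit_sum {
    static bool create() { return acc_dt != 0; } // e.g. a mayiuse(...) check
};

// Type-erased list entry: only a uniform function pointer survives.
struct item {
    bool (*create)();
};

template <typename Impl>
item make_item() {
    return item {&Impl::create};
}

// Analogue of SUM_INSTANCE_AVX512(jit_xf16_sum_t) / INSTANCE(simple_sum_t):
const std::vector<item> sum_impl_list = {
    make_item<jit_sum<1>>(),
    make_item<jit_sum<2>>(),
    make_item<jit_sum<0>>(), // reference fallback
};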
@@ -34,7 +34,7 @@ namespace cpu { struct cpu_stream_t : public stream_t { cpu_stream_t(engine_t *engine, impl::stream_impl_t *stream_impl) : stream_t(engine, stream_impl) {} - virtual ~cpu_stream_t() = default; + ~cpu_stream_t() override = default; dnnl::impl::status_t wait() override { // CPU execution is synchronous so return immediately diff --git a/src/cpu/cpu_sum.cpp b/src/cpu/cpu_sum.cpp index 3f1ff8911e9..24095785fcc 100644 --- a/src/cpu/cpu_sum.cpp +++ b/src/cpu/cpu_sum.cpp @@ -32,21 +32,18 @@ namespace cpu { namespace { using namespace dnnl::impl::data_type; + #define INSTANCE(...) \ impl_list_item_t(impl_list_item_t::sum_type_deduction_helper_t< \ __VA_ARGS__::pd_t>()), #define SUM_INSTANCE_AVX512(...) REG_AVX512_ISA(INSTANCE(__VA_ARGS__)) #define SUM_INSTANCE_AVX2(...) REG_AVX2_ISA(INSTANCE(__VA_ARGS__)) // clang-format off -constexpr impl_list_item_t cpu_sum_impl_list[] = REG_SUM_P({ +const impl_list_item_t cpu_sum_impl_list[] = REG_SUM_P({ SUM_INSTANCE_AVX512(jit_xf16_sum_t) SUM_INSTANCE_AVX512(jit_xf16_sum_t) SUM_INSTANCE_AVX2(jit_xf16_sum_t) SUM_INSTANCE_AVX2(jit_xf16_sum_t) - SUM_INSTANCE_AVX2(jit_xf16_sum_t) - SUM_INSTANCE_AVX2(jit_xf16_sum_t) - INSTANCE(simple_sum_t) - INSTANCE(simple_sum_t) INSTANCE(simple_sum_t) INSTANCE(simple_sum_t) INSTANCE(simple_sum_t) diff --git a/src/cpu/dw_convolution_utils.hpp b/src/cpu/dw_convolution_utils.hpp index 088e01b9964..f10a13334b1 100644 --- a/src/cpu/dw_convolution_utils.hpp +++ b/src/cpu/dw_convolution_utils.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2020-2023 Intel Corporation +* Copyright 2020-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -39,27 +39,27 @@ inline status_t get_depthwise_conv_desc(convolution_desc_t &cd_dw, || !attr_1x1.post_ops_.entry_[dw_po_index].is_convolution()) return status::invalid_arguments; + // TODO: [AV] remove this check once the original oneDNN dw conv fusion is used + if (attr_1x1.post_ops_.entry_[dw_po_index].is_convolution()) + return status::unimplemented; + // Create new attributes with scales from depthwise post-op and copy // post-ops after depthwise post-op.
auto &dw_po = attr_1x1.post_ops_.entry_[dw_po_index].depthwise_conv; - // erase 1x1 conv scales - for (auto arg : {DNNL_ARG_SRC, DNNL_ARG_WEIGHTS, DNNL_ARG_DST}) { - auto &scale = attr_dw.scales_.get(arg); - if (!scale.has_default_values()) attr_dw.scales_.reset(arg); - } - const auto &dw_src_scales = attr_1x1.scales_.get(DNNL_ARG_DST); const auto &dw_wei_scales = attr_1x1.scales_.get(DNNL_ARG_ATTR_POST_OP_DW | DNNL_ARG_WEIGHTS); const auto &dw_dst_scales = attr_1x1.scales_.get(DNNL_ARG_ATTR_POST_OP_DW | DNNL_ARG_DST); + + assert(attr_dw.scales_.has_default_values()); if (!dw_src_scales.has_default_values()) - attr_dw.scales_.set(DNNL_ARG_SRC, dw_src_scales.mask_); + CHECK(attr_dw.scales_.set(DNNL_ARG_SRC, dw_src_scales.get_mask())); if (!dw_wei_scales.has_default_values()) - attr_dw.scales_.set(DNNL_ARG_WEIGHTS, dw_wei_scales.mask_); + CHECK(attr_dw.scales_.set(DNNL_ARG_WEIGHTS, dw_wei_scales.get_mask())); if (!dw_dst_scales.has_default_values()) - attr_dw.scales_.set(DNNL_ARG_DST, dw_dst_scales.mask_); + CHECK(attr_dw.scales_.set(DNNL_ARG_DST, dw_dst_scales.get_mask())); auto dw_po_len = attr_1x1.post_ops_.len() - (dw_po_index + 1); attr_dw.post_ops_.entry_.resize(dw_po_len); diff --git a/src/cpu/gemm/f32/ref_gemm_f32.cpp b/src/cpu/gemm/f32/ref_gemm_f32.cpp index e7d69f01727..944df461e3c 100644 --- a/src/cpu/gemm/f32/ref_gemm_f32.cpp +++ b/src/cpu/gemm/f32/ref_gemm_f32.cpp @@ -38,7 +38,10 @@ template <typename data_t> void copy_A( bool isTransA, dim_t K, const data_t *A, const dim_t lda, data_t *ws) { for (dim_t k = 0; k < K; k++) { +#if !defined(_MSC_VER) + // Compiling with '#pragma omp simd' in this place on VS2019 leads to fatal error C1001 PRAGMA_OMP_SIMD() +#endif for (dim_t i = 0; i < unroll_factor<data_t>::m; i++) { ws[i] = isTransA ? A[i * lda + k] : A[i + k * lda]; } diff --git a/src/cpu/gemm_convolution.cpp b/src/cpu/gemm_convolution.cpp index 672997f6171..80edde22afa 100644 --- a/src/cpu/gemm_convolution.cpp +++ b/src/cpu/gemm_convolution.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2016-2023 Intel Corporation +* Copyright 2016-2024 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License.
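The dw_convolution_utils.hpp hunk above replaces bare attr_dw.scales_.set(...) calls with CHECK(...), the early-return convention oneDNN uses for status_t results. A minimal sketch of that pattern under an assumed macro name (the real macro is defined in the common headers):

enum class status { success, invalid_arguments };

// Assumed name; mirrors the early-return behavior of the real CHECK macro.
#define CHECK_SKETCH(expr) \
    do { \
        status s_ = (expr); \
        if (s_ != status::success) return s_; \
    } while (0)

status set_mask(int mask) {
    return mask >= 0 ? status::success : status::invalid_arguments;
}

status propagate_masks() {
    CHECK_SKETCH(set_mask(0)); // bail out on the first failing setter
    CHECK_SKETCH(set_mask(2));
    return status::success;
}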
@@ -23,6 +23,9 @@ #include "common/type_helpers.hpp" #include "common/utils.hpp" #include "cpu/gemm_convolution.hpp" +#if DNNL_X64 +#include "cpu/x64/injectors/jit_uni_postops_injector.hpp" +#endif namespace dnnl { namespace impl { @@ -51,13 +54,20 @@ status_t gemm_convolution_fwd_t::execute_forward_nspc( auto bia_base = CTX_IN_MEM(const data_t *, DNNL_ARG_BIAS); auto dst_base = CTX_OUT_MEM(data_t *, DNNL_ARG_DST); +#if DNNL_X64 + const auto post_ops_binary_rhs_arg_vec + = x64::binary_injector::prepare_binary_args(pd()->jcp_.post_ops, ctx); +#else + const auto post_ops_binary_rhs_arg_vec = std::vector(); +#endif + auto scratchpad = ctx.get_scratchpad_grantor(); const conv_gemm_conf_t &jcp = pd()->jcp_; std::atomic st(status::success); parallel(jcp.nthr, [&](const int ithr, const int nthr) { status_t st_thr = execute_forward_thr_nspc(ctx, ithr, nthr, src_base, - wei_base, bia_base, dst_base, scratchpad); + wei_base, bia_base, dst_base, scratchpad, post_ops_binary_rhs_arg_vec); if (st_thr != status::success) st = st_thr; }); @@ -67,7 +77,7 @@ status_t gemm_convolution_fwd_t::execute_forward_nspc( status_t gemm_convolution_fwd_t::execute_forward_thr_nspc(const exec_ctx_t &ctx, const int ithr, const int nthr, const data_t *src_base, const data_t *wei_base, const data_t *bia_base, data_t *dst_base, - const memory_tracking::grantor_t &scratchpad) const { + const memory_tracking::grantor_t &scratchpad, const std::vector& post_ops_binary_rhs_arg_vec) const { const conv_gemm_conf_t &jcp = pd()->jcp_; // Src Format: mb-spatial-groups-input_channels @@ -151,68 +161,16 @@ status_t gemm_convolution_fwd_t::execute_forward_thr_nspc(const exec_ctx_t &ctx, &LDC); if (st != status::success) return st; - if (jcp.with_bias || jcp.with_eltwise || jcp.with_binary) { - parallel(0, [&](int ithr, int nthr) { - dim_t start, end; - balance211(N * jcp.oc, nthr, ithr, start, end); - - const size_t first_oc = start % jcp.oc; - const size_t last_oc = (end - 1) % jcp.oc; - const size_t first_os = start / jcp.oc; - const size_t last_os = (end - 1) / jcp.oc; - - for (size_t os = first_os; os <= last_os; ++os) { - const size_t start_oc = (os == first_os) ? first_oc : 0; - const size_t end_oc - = (os == last_os) ? last_oc : jcp.oc - 1; - - const data_t *__restrict bia_arr - = bia_base ? bia_base + g * jcp.oc : nullptr; - data_t *__restrict dst_arr = dst + os * dst_os_stride; - - if (jcp.with_bias) { - PRAGMA_OMP_SIMD() - for (size_t oc = start_oc; oc <= end_oc; oc++) { - dst_arr[oc] += bia_arr[oc]; - } - } + if (pp_kernel_) { + const size_t first_oc = g * jcp.oc; + const size_t last_oc = jcp.oc; + const size_t first_os = 0; + const size_t last_os = N; - if (jcp.with_eltwise || jcp.with_binary) { - bool fast_relu_done = false; - if (jcp.with_eltwise && jcp.post_ops.len() == 1) { - // fast branch for ReLU case - const auto &eltwise - = jcp.post_ops.entry_.back().eltwise; - - if (eltwise.alg == alg_kind::eltwise_relu) { - const auto alpha = eltwise.alpha; - const auto scale = eltwise.scale; - PRAGMA_OMP_SIMD() - for (size_t oc = start_oc; oc <= end_oc; - oc++) { - if (dst_arr[oc] < 0) - dst_arr[oc] *= alpha; - dst_arr[oc] *= scale; - } - fast_relu_done = true; - } - } - if (!fast_relu_done) { - ref_post_ops_t::args_t args; - args.ctx = &ctx; - args.dst_md = pd()->dst_md(); - - for (size_t oc = start_oc; oc <= end_oc; oc++) { - // jcp.od is not part of jcp.os, so multiply - // jcp.od to get spatial offset. 
- args.l_offset = (g * jcp.oc + oc) - * (jcp.os * jcp.od); - post_ops_->execute(dst_arr[oc], args); - } - } - } - } - }); + for (size_t os = first_os; os < last_os; ++os) { + data_t* dst_local = dst + os * dst_os_stride; + (*pp_kernel_)(dst_base, dst_local, bia_base, 1, first_oc, last_oc, 1, post_ops_binary_rhs_arg_vec); + } } } nd_iterator_step(n, jcp.mb, g, jcp.ngroups, ohb, nb_oh, owb, nb_ow); @@ -226,16 +184,37 @@ status_t gemm_convolution_fwd_t::execute_forward_ncsp( auto weights = CTX_IN_MEM(const data_t *, DNNL_ARG_WEIGHTS); auto bias = CTX_IN_MEM(const data_t *, DNNL_ARG_BIAS); auto dst = CTX_OUT_MEM(data_t *, DNNL_ARG_DST); + auto dst_orig = dst; + +#if DNNL_X64 + const auto post_ops_binary_rhs_arg_vec + = x64::binary_injector::prepare_binary_args(pd()->jcp_.post_ops, ctx); +#else + const auto post_ops_binary_rhs_arg_vec = std::vector(); +#endif auto col = ctx.get_scratchpad_grantor().get(key_conv_gemm_col); const conv_gemm_conf_t &jcp = this->pd()->jcp_; - const size_t src_step = jcp.ic * jcp.ih * jcp.iw * jcp.id; + const memory_desc_wrapper src_d(pd()->src_md()); + const memory_desc_wrapper dst_d(pd()->dst_md()); + + // The second arg in template means sub_offset0 = true + // See `blk_off` method definition. + const size_t src_mb_stride = src_d.blk_off(1); + const size_t src_g_stride = src_d.blk_off(0, 1) * jcp.ic; + + const size_t dst_mb_stride = dst_d.blk_off(1); + const size_t dst_g_stride = dst_d.blk_off(0, 1) * jcp.oc; + const size_t weights_oc_size = jcp.ic * jcp.ks; const size_t weights_g_size = weights_oc_size * jcp.oc; const bool is_problem_3d = pd()->ndims() == 5; + src += src_d.off_l(0); + dst += dst_d.off_l(0); + assert(IMPLICATION(is_problem_3d, jcp.os_block == jcp.os && jcp.ic_block == jcp.ic && jcp.os_nb_block == 1)); @@ -254,7 +233,7 @@ status_t gemm_convolution_fwd_t::execute_forward_ncsp( auto inner_ker = [&](int spatial, const im_pos_t &curr, im_pos_t &prev, im_pos_t &step, const im_pos_t &end) { const data_t *_src - = src + (curr.n * jcp.ngroups + curr.g) * src_step; + = src + curr.n * src_mb_stride + curr.g * src_g_stride; step.oc = nstl::min( jcp.oc_block, nstl::min(jcp.oc, end.oc) - curr.oc); step.sp = nstl::min(jcp.os_block, @@ -275,10 +254,9 @@ status_t gemm_convolution_fwd_t::execute_forward_ncsp( const data_t one = 1.0; const dim_t M = jcp.os * jcp.od; - const size_t dst_step = jcp.oc * M; const dim_t m = step.sp; const dim_t LDA = jcp.im2col_sz ? m : M; - data_t *_dst = dst + (curr.n * jcp.ngroups + curr.g) * dst_step + data_t *_dst = dst + curr.n * dst_mb_stride + curr.g * dst_g_stride + curr.oc * M + curr.od * jcp.os + curr.sp; const dim_t K = step.ic * jcp.ks; const dim_t LDB = jcp.ic * jcp.ks; @@ -296,61 +274,8 @@ status_t gemm_convolution_fwd_t::execute_forward_ncsp( &LDA, _weights, &LDB, &beta, _dst, &M); if (st != status::success) return st; - if (curr.ic == jcp.ic - step.ic) { - // TODO: for "outer threading" we have parallel section within - // outermost "parallel". It is not good. Consider to use - // "parallel" here with number of threads passed as parameter - const int oc_start = curr.g * jcp.oc + curr.oc; - if (jcp.with_eltwise || jcp.with_binary) { - bool fast_relu_done = false; - if (jcp.with_eltwise && jcp.post_ops.len() == 1) { - // fast branch for ReLU case - const auto &eltwise - = jcp.post_ops.entry_.back().eltwise; - if (eltwise.alg == alg_kind::eltwise_relu) { - parallel_nd(step.oc, [&](dim_t oc) { - data_t b = jcp.with_bias ? 
bias[oc_start + oc] - : 0; - data_t *d_ = _dst + oc * M; - PRAGMA_OMP_SIMD() - for (int oS = 0; oS < m; ++oS) { - d_[oS] += b; - if (d_[oS] < 0) d_[oS] *= eltwise.alpha; - d_[oS] *= eltwise.scale; - } - }); - fast_relu_done = true; - } - } - if (!fast_relu_done) { - parallel_nd(step.oc, [&](dim_t oc) { - data_t b = jcp.with_bias ? bias[oc_start + oc] : 0; - data_t *d_ = _dst + oc * M; - - ref_post_ops_t::args_t args; - args.ctx = &ctx; - args.dst_md = pd()->dst_md(); - args.l_offset = d_ - dst; - - PRAGMA_OMP_SIMD() - for (int oS = 0; oS < m; ++oS) { - d_[oS] += b; - post_ops_->execute(d_[oS], args); - args.l_offset++; - } - }); - } - - } else if (jcp.with_bias) { - parallel_nd(step.oc, [&](dim_t oc) { - data_t b = bias[oc_start + oc]; - data_t *d_ = _dst + oc * M; - PRAGMA_OMP_SIMD() - for (int oS = 0; oS < m; ++oS) { - d_[oS] += b; - } - }); - } + if (pp_kernel_ && curr.ic == jcp.ic - step.ic) { + (*pp_kernel_)(dst_orig, _dst, bias, m, curr.g * jcp.oc + curr.oc, step.oc, M, post_ops_binary_rhs_arg_vec); } return status::success; @@ -422,13 +347,20 @@ status_t gemm_convolution_bwd_data_t::execute_backward_data_nspc( auto bia_base = CTX_IN_MEM(const data_t *, DNNL_ARG_BIAS); auto diff_src_base = CTX_OUT_MEM(data_t *, DNNL_ARG_DIFF_SRC); +#if DNNL_X64 + const auto post_ops_binary_rhs_arg_vec + = x64::binary_injector::prepare_binary_args(pd()->jcp_.post_ops, ctx); +#else + const auto post_ops_binary_rhs_arg_vec = std::vector(); +#endif + auto scratchpad = ctx.get_scratchpad_grantor(); const conv_gemm_conf_t &jcp = pd()->jcp_; std::atomic st(status::success); parallel(jcp.nthr, [&](const int ithr, const int nthr) { status_t st_thr = execute_backward_data_thr_nspc(ithr, nthr, - diff_dst_base, wei_base, bia_base, diff_src_base, scratchpad); + diff_dst_base, wei_base, bia_base, diff_src_base, scratchpad, post_ops_binary_rhs_arg_vec); if (st_thr != status::success) st = st_thr; }); @@ -438,7 +370,8 @@ status_t gemm_convolution_bwd_data_t::execute_backward_data_nspc( status_t gemm_convolution_bwd_data_t::execute_backward_data_thr_nspc( const int ithr, const int nthr, const data_t *diff_dst_base, const data_t *wei_base, const data_t *bia_base, data_t *diff_src_base, - const memory_tracking::grantor_t &scratchpad) const { + const memory_tracking::grantor_t &scratchpad, + const std::vector& post_ops_binary_rhs_arg_vec) const { const conv_gemm_conf_t &jcp = pd()->jcp_; // Diff_dst Format: mb-spatial-groups-output_channels @@ -458,6 +391,8 @@ status_t gemm_convolution_bwd_data_t::execute_backward_data_thr_nspc( // threads share work across mini-batch and groups const dim_t work_amount = jcp.ngroups * jcp.mb; + const auto &p = pd()->attr()->post_ops_; + data_t *__restrict col = scratchpad.get(key_conv_gemm_col) + (ptrdiff_t)ithr * jcp.im2col_sz; const bool acc_needed = jcp.ngroups > 1; @@ -506,6 +441,31 @@ status_t gemm_convolution_bwd_data_t::execute_backward_data_thr_nspc( } }); } + if (p.len() > 0) { + std::size_t post_ops_data_idx = 0; + int depthwise_inj_idx = 0; + for (int i = 0; i < p.len(); i++) { + auto &post_op = p.entry_[i]; + if (post_op.is_depthwise()) { + auto depthwise_base = reinterpret_cast(post_ops_binary_rhs_arg_vec[post_ops_data_idx]); + auto depthwise_weights = depthwise_base + post_op.depthwise.offset[post_op.depthwise.scales]; + auto depthwise_bias = post_op.depthwise.alg == alg_kind::depthwise_scale_shift + ? 
depthwise_base + post_op.depthwise.offset[post_op.depthwise.shifts] + : nullptr; + + parallel_nd(static_cast(jcp.is) * jcp.id, [&](size_t is) { + data_t *__restrict diff_src_arr + = diff_src + is * diff_src_os_stride; + for (int ic = 0; ic < jcp.ic; ic++) { + diff_src_arr[ic] = depthwise_injectors[depthwise_inj_idx]->compute_scalar(diff_src_arr[ic], + depthwise_weights + g * jcp.ic + ic, depthwise_bias + g * jcp.ic + ic); + } + }); + post_ops_data_idx++; + depthwise_inj_idx++; + } + } + } nd_iterator_step(n, jcp.mb, g, jcp.ngroups); } return status::success; @@ -517,13 +477,28 @@ status_t gemm_convolution_bwd_data_t::execute_backward_data_ncsp( auto weights = CTX_IN_MEM(const data_t *, DNNL_ARG_WEIGHTS); auto diff_src = CTX_OUT_MEM(data_t *, DNNL_ARG_DIFF_SRC); +#if DNNL_X64 + const auto post_ops_binary_rhs_arg_vec + = x64::binary_injector::prepare_binary_args(pd()->jcp_.post_ops, ctx); +#else + const auto post_ops_binary_rhs_arg_vec = std::vector(); +#endif + auto col = ctx.get_scratchpad_grantor().get(key_conv_gemm_col); const conv_gemm_conf_t &jcp = this->pd()->jcp_; const dim_t M = jcp.os * jcp.od; - const size_t src_step = (size_t)jcp.ic * jcp.ih * jcp.iw * jcp.id; - const size_t dst_step = (size_t)jcp.oc * M; + const size_t src_step_to_clean = (size_t)jcp.ic * jcp.ih * jcp.iw * jcp.id; + const memory_desc_wrapper diff_src_d(pd()->diff_src_md()); + const memory_desc_wrapper diff_dst_d(pd()->diff_dst_md()); + + // The second arg in template means sub_offset0 = true + // See `blk_off` method definition. + const size_t src_step = diff_src_d.blk_off(1) / jcp.ngroups; + const size_t dst_step = diff_dst_d.blk_off(1) / jcp.ngroups; + diff_src += diff_src_d.off_l(0); + diff_dst += diff_dst_d.off_l(0); const size_t weights_g_size = (size_t)jcp.ic * jcp.oc * jcp.ks; const dim_t m = jcp.os_block; @@ -533,6 +508,8 @@ status_t gemm_convolution_bwd_data_t::execute_backward_data_ncsp( const dim_t work_amount = (size_t)jcp.ngroups * jcp.mb; const bool is_problem_3d = pd()->ndims() == 5; + const auto &p = pd()->attr()->post_ops_; + std::atomic st(status::success); parallel(jcp.nthr, [&](const int ithr, const int nthr) { data_t *_col = col + (ptrdiff_t)ithr * jcp.im2col_sz; @@ -547,7 +524,7 @@ status_t gemm_convolution_bwd_data_t::execute_backward_data_ncsp( if (is_problem_3d && jcp.im2col_sz > 0) { // jit_gemm_convolution_utils::col2im_3d() assumes that the // accumulator is initialized by zeroes - for (size_t i = 0; i < src_step; i++) + for (size_t i = 0; i < src_step_to_clean; i++) _diff_src[i] = (data_t)0; } @@ -580,6 +557,31 @@ status_t gemm_convolution_bwd_data_t::execute_backward_data_ncsp( } } } + if (p.len() > 0) { + std::size_t post_ops_data_idx = 0; + int depthwise_inj_idx = 0; + for (int i = 0; i < p.len(); i++) { + auto &post_op = p.entry_[i]; + if (post_op.is_depthwise()) { + auto depthwise_base = reinterpret_cast(post_ops_binary_rhs_arg_vec[post_ops_data_idx]); + auto depthwise_weights = depthwise_base + post_op.depthwise.offset[post_op.depthwise.scales]; + auto depthwise_bias = post_op.depthwise.alg == alg_kind::depthwise_scale_shift + ? 
depthwise_base + post_op.depthwise.offset[post_op.depthwise.shifts] + : nullptr; + parallel_nd(jcp.ic, [&](const int ic) { + for (int id = 0; id < jcp.id; ++id) { + data_t *d_ = _diff_src + ic * jcp.id * jcp.is + id * jcp.is; + for (int iS = 0; iS < jcp.is; ++iS) { + d_[iS] = depthwise_injectors[depthwise_inj_idx]->compute_scalar(d_[iS], + depthwise_weights + g * jcp.ic + ic, depthwise_bias + g * jcp.ic + ic); + } + } + }); + post_ops_data_idx++; + depthwise_inj_idx++; + } + } + } nd_iterator_step(g, jcp.ngroups, n, jcp.mb); } }); diff --git a/src/cpu/gemm_convolution.hpp b/src/cpu/gemm_convolution.hpp index c321266ebc4..1000d8fa16f 100644 --- a/src/cpu/gemm_convolution.hpp +++ b/src/cpu/gemm_convolution.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2016-2023 Intel Corporation +* Copyright 2016-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -28,15 +28,19 @@ #include "cpu/gemm_convolution_utils.hpp" #include "cpu/primitive_attr_postops.hpp" +#include "ref_depthwise_injector.hpp" + +#if DNNL_X64 +#include "cpu/x64/cpu_isa_traits.hpp" +#include "cpu/x64/injectors/jit_uni_binary_injector.hpp" +#endif namespace dnnl { namespace impl { namespace cpu { struct gemm_convolution_fwd_t : public primitive_t { struct pd_t : public cpu_convolution_fwd_pd_t { - pd_t(const convolution_desc_t *adesc, const primitive_attr_t *attr, - const typename pd_t::base_class *hint_fwd_pd) - : cpu_convolution_fwd_pd_t(adesc, attr, hint_fwd_pd), jcp_() {} + using cpu_convolution_fwd_pd_t::cpu_convolution_fwd_pd_t; DECLARE_COMMON_PD_T( GEMM_IMPL_STR, gemm_convolution_fwd_t, USE_GLOBAL_SCRATCHPAD); @@ -55,48 +59,59 @@ struct gemm_convolution_fwd_t : public primitive_t { primitive_attr_t::skip_mask_t::post_ops, f32), VERBOSE_UNSUPPORTED_ATTR); VDISPATCH_CONV(post_ops_ok(), VERBOSE_UNSUPPORTED_POSTOP); - auto scratchpad = scratchpad_registry().registrar(); + // TODO: make `init_conf` assign initialized object to `jcp_` + jcp_ = conv_gemm_conf_t(); return jit_gemm_convolution_utils::init_conf(jcp_, scratchpad, *desc(), src_md_, weights_md_, dst_md_, bias_md_, attr_, dnnl_get_max_threads()); } - conv_gemm_conf_t jcp_; + conv_gemm_conf_t jcp_ = utils::zero(); protected: bool post_ops_ok() const { + using namespace dnnl::impl::primitive_kind; auto const &po = attr()->post_ops_; - auto is_sum_ok = [&](int idx) { - return IMPLICATION(po.entry_[idx].kind == primitive_kind::sum, - idx == 0 && po.entry_[idx].is_sum()); - }; - auto is_binary - = [&](int idx) { return po.entry_[idx].is_binary(); }; - auto is_prelu = [&](int idx) { return po.entry_[idx].is_prelu(); }; - auto is_binary_or_prelu_supported = [&](int idx) { - bool ok = dnnl::impl::get_rhs_arg_broadcasting_strategy( - binary_injector_utils::get_src1_desc( - po.entry_[idx], dst_md_), - dst_md_, - {broadcasting_strategy_t::scalar, - broadcasting_strategy_t::per_oc}) - != broadcasting_strategy_t::unsupported; - return ok; - }; - - if (!ref_post_ops_t::primitive_kind_ok(attr()->post_ops_)) - return false; - for (int idx = 0; idx < po.len(); idx++) { - bool ok = is_sum_ok(idx) - && IMPLICATION(is_binary(idx) || is_prelu(idx), - is_binary_or_prelu_supported(idx)); - if (!ok) return false; - } + auto all_post_ops_supported = [&]() { + for (int i = 0; i < po.len(); i++) { + const auto &post_op = po.entry_[i]; + if (!utils::one_of(post_op.kind, sum, binary, eltwise, + depthwise, quantization)) + return 
false; + +#if DNNL_X64 + using namespace cpu::x64; + cpu_isa_t isa = isa_undef; + if (po.entry_[i].kind == binary) { + auto dst_md = this->dst_md(); + if (mayiuse(avx512_core)) + isa = avx512_core; + else if (mayiuse(avx2)) + isa = avx2; + else if (mayiuse(sse41)) + isa = sse41; + if ((isa == isa_undef) + || !binary_injector::is_supported(isa, + binary_injector::get_src1_desc( + post_op, *dst_md), + *dst_md, default_strategies())) { + return false; + } + } +#endif + } + return true; + }; + auto contain = [&](dnnl::impl::primitive_kind_t kind) { return po.find(kind) != -1; }; + auto position = [&](dnnl::impl::primitive_kind_t kind) { return po.find(kind); }; + auto count = [&](dnnl::impl::primitive_kind_t kind) { return po.count(kind); }; - return true; + return all_post_ops_supported() && + count(primitive_kind::sum) <= 1 && + IMPLICATION(contain(primitive_kind::sum), position(primitive_kind::sum) == 0); } }; @@ -104,18 +119,20 @@ struct gemm_convolution_fwd_t : public primitive_t { : primitive_t(apd), post_ops_(nullptr) {} status_t init(engine_t *engine) override { + const auto &post_ops = pd()->attr()->post_ops_; const data_t one = 1.0, zero = 0.0; const auto &jcp = pd()->jcp_; beta_ = jcp.with_sum ? one : zero; - if (jcp.with_eltwise || jcp.with_binary) { - CHECK(safe_ptr_assign(post_ops_, new ref_post_ops_t(jcp.post_ops))); - CHECK(post_ops_->init(pd()->dst_md())); - } - return status::success; + bool has_bias = pd()->with_bias(); + bool has_post_ops = post_ops.len() > 0; + postops_in_ip_ = has_bias || has_post_ops; + + CHECK(safe_ptr_assign(pp_kernel_, pp_kernel_t::create(pd(), pd()->jcp_))); + return (pp_kernel_) ? pp_kernel_->create_kernel() : status::success; } - typedef typename prec_traits::type data_t; + using data_t = typename prec_traits_t::type; status_t execute(const exec_ctx_t &ctx) const override { bool is_nspc = pd()->jcp_.is_nspc; @@ -128,9 +145,13 @@ struct gemm_convolution_fwd_t : public primitive_t { status_t execute_forward_thr_nspc(const exec_ctx_t &ctx, const int ithr, const int nthr, const data_t *src_base, const data_t *wei_base, const data_t *bia_base, data_t *dst_base, - const memory_tracking::grantor_t &scratchpad) const; + const memory_tracking::grantor_t &scratchpad, + const std::vector& post_ops_binary_rhs_arg_vec) const; const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); } + using pp_kernel_t = gemm_convolution_utils::pp_kernel_t; + std::unique_ptr pp_kernel_; + bool postops_in_ip_; data_t beta_; std::unique_ptr post_ops_; @@ -138,9 +159,7 @@ struct gemm_convolution_fwd_t : public primitive_t { struct gemm_convolution_bwd_data_t : public primitive_t { struct pd_t : public cpu_convolution_bwd_data_pd_t { - pd_t(const convolution_desc_t *adesc, const primitive_attr_t *attr, - const convolution_fwd_pd_t *hint_fwd_pd) - : cpu_convolution_bwd_data_pd_t(adesc, attr, hint_fwd_pd), jcp_() {} + using cpu_convolution_bwd_data_pd_t::cpu_convolution_bwd_data_pd_t; DECLARE_COMMON_PD_T(GEMM_IMPL_STR, gemm_convolution_bwd_data_t, USE_GLOBAL_SCRATCHPAD); @@ -156,21 +175,56 @@ struct gemm_convolution_bwd_data_t : public primitive_t { VERBOSE_BAD_ALGORITHM); VDISPATCH_CONV(!has_zero_dim_memory(), VERBOSE_EMPTY_TENSOR, ""); VDISPATCH_CONV( - attr()->has_default_values(), VERBOSE_UNSUPPORTED_ATTR); + is_supported_post_ops(), VERBOSE_UNSUPPORTED_ATTR); auto scratchpad = scratchpad_registry().registrar(); + // TODO: make `init_conf` assign initialized object to `jcp_` + jcp_ = conv_gemm_conf_t(); return jit_gemm_convolution_utils::init_conf(jcp_, scratchpad, 
*desc(), diff_src_md_, weights_md_, diff_dst_md_, bias_md_, attr_, dnnl_get_max_threads()); } - conv_gemm_conf_t jcp_; + conv_gemm_conf_t jcp_ = utils::zero(); + + protected: + virtual bool is_supported_post_ops() const { + const auto &p = this->attr()->post_ops_; + if (p.len() > 1) + return false; + + auto all_post_ops_supported = [&]() { + bool ok = true; + + for (int i = 0; i < p.len(); i++) { + ok = ok && utils::one_of(p.entry_[i].kind, primitive_kind::depthwise); + } + return ok; + }; + + return all_post_ops_supported(); + } }; - gemm_convolution_bwd_data_t(const pd_t *apd) : primitive_t(apd) {} - typedef typename prec_traits::type data_t; + gemm_convolution_bwd_data_t(const pd_t *apd) : primitive_t(apd) { + const auto &post_ops = pd()->attr()->post_ops_; + for (int i = 0; i < post_ops.len(); i++) { + auto &post_op = post_ops.entry_[i]; + if (post_op.is_depthwise()) { + depthwise_injectors.push_back(new ref_depthwise_scalar_fwd_t(post_op.depthwise.alg)); + } + } + } + + ~gemm_convolution_bwd_data_t() { + for (auto inj : depthwise_injectors) + delete inj; + depthwise_injectors.clear(); + } + + using data_t = typename prec_traits_t::type; status_t execute(const exec_ctx_t &ctx) const override { bool is_nspc = pd()->jcp_.is_nspc; @@ -184,17 +238,18 @@ struct gemm_convolution_bwd_data_t : public primitive_t { status_t execute_backward_data_thr_nspc(const int ithr, const int nthr, const data_t *diff_dst_base, const data_t *wei_base, const data_t *bia_base, data_t *diff_src_base, - const memory_tracking::grantor_t &scratchpad) const; + const memory_tracking::grantor_t &scratchpad, + const std::vector& post_ops_binary_rhs_arg_vec) const; const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); } + + nstl::vector depthwise_injectors; }; struct gemm_convolution_bwd_weights_t : public primitive_t { struct pd_t : public cpu_convolution_bwd_weights_pd_t { - pd_t(const convolution_desc_t *adesc, const primitive_attr_t *attr, - const convolution_fwd_pd_t *hint_fwd_pd) - : cpu_convolution_bwd_weights_pd_t(adesc, attr, hint_fwd_pd) - , jcp_() {} + using cpu_convolution_bwd_weights_pd_t:: + cpu_convolution_bwd_weights_pd_t; DECLARE_COMMON_PD_T(GEMM_IMPL_STR, gemm_convolution_bwd_weights_t, USE_GLOBAL_SCRATCHPAD); @@ -213,17 +268,19 @@ struct gemm_convolution_bwd_weights_t : public primitive_t { attr()->has_default_values(), VERBOSE_UNSUPPORTED_ATTR); auto scratchpad = scratchpad_registry().registrar(); + // TODO: make `init_conf` assign initialized object to `jcp_` + jcp_ = conv_gemm_conf_t(); return jit_gemm_convolution_utils::init_conf(jcp_, scratchpad, *desc(), src_md_, diff_weights_md_, diff_dst_md_, diff_bias_md_, attr_, dnnl_get_max_threads()); } - conv_gemm_conf_t jcp_; + conv_gemm_conf_t jcp_ = utils::zero(); }; gemm_convolution_bwd_weights_t(const pd_t *apd) : primitive_t(apd) {} - typedef typename prec_traits::type data_t; + using data_t = typename prec_traits_t::type; status_t execute(const exec_ctx_t &ctx) const override { const bool is_nspc = pd()->jcp_.is_nspc; diff --git a/src/cpu/gemm_convolution_utils.cpp b/src/cpu/gemm_convolution_utils.cpp index 2de4ddcf39f..5060fa1fd03 100644 --- a/src/cpu/gemm_convolution_utils.cpp +++ b/src/cpu/gemm_convolution_utils.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2016-2024 Intel Corporation +* Copyright 2016-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
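The depthwise post-op wired into the backward-data paths above reduces to a per-channel affine transform. As a rough standalone sketch (hypothetical names, no oneDNN dependencies), this is what `compute_scalar` amounts to for the `depthwise_scale_shift` algorithm, applied the same way the loops above apply it per channel:

#include <cstddef>
#include <vector>

// Stand-in for ref_depthwise_scalar_fwd_t with alg == depthwise_scale_shift:
// y = x * scale[c] + shift[c]; the shift pointer may be absent.
inline float depthwise_scale_shift(float x, const float *scale, const float *shift) {
    return x * *scale + (shift ? *shift : 0.f);
}

// Apply the post-op over one CHW image, one (scale, shift) pair per channel,
// mirroring the `depthwise_weights + g * jcp.ic + ic` indexing above.
void apply_depthwise(std::vector<float> &chw, size_t C, size_t HW,
        const float *scales, const float *shifts) {
    for (size_t c = 0; c < C; ++c)
        for (size_t s = 0; s < HW; ++s)
            chw[c * HW + s] = depthwise_scale_shift(chw[c * HW + s],
                    scales + c, shifts ? shifts + c : nullptr);
}

This also shows why the injectors only need two raw pointers per channel: the tensor layout is resolved once via `post_op.depthwise.offset[...]`, after which the kernel just indexes by channel.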
@@ -23,6 +23,10 @@ #include "common/utils.hpp" #include "cpu/gemm_convolution_utils.hpp" #include "cpu/scale_utils.hpp" + +#include "ref_eltwise.hpp" +#include "ref_depthwise_injector.hpp" + #if DNNL_X64 #include "cpu/x64/injectors/jit_uni_postops_injector.hpp" #endif @@ -30,6 +34,7 @@ #include "cpu/platform.hpp" #if DNNL_X64 +#include "cpu/x64/jit_gemm_convolution_utils.hpp" #include "cpu/x64/cpu_isa_traits.hpp" #endif @@ -51,13 +56,173 @@ single_gemm_conv_chunk_desc_t::single_gemm_conv_chunk_desc_t(dim_t d_off, , w_off_(w_off) , w_size_(w_size) {} +namespace gemm_convolution_utils { + +struct ref_pp_kernel_t : pp_kernel_t { + ref_pp_kernel_t(const convolution_pd_t *pd, const conv_gemm_conf_t &jcp) + : pp_kernel_t(pd, jcp) { + for (int i = 0; i < post_ops_.len(); i++) { + auto &post_op = post_ops_.entry_[i]; + if (post_op.is_eltwise()) { + ref_eltwise_injectors_.push_back(new ref_eltwise_scalar_fwd_t(post_op.eltwise)); + } else if (post_op.is_depthwise()) { + ref_depthwise_injectors_.push_back(new ref_depthwise_scalar_fwd_t( + post_op.depthwise.alg)); + } + } + } + ~ref_pp_kernel_t() { + for (auto impl : ref_eltwise_injectors_) + delete impl; + ref_eltwise_injectors_.clear(); + for (auto impl : ref_depthwise_injectors_) + delete impl; + ref_depthwise_injectors_.clear(); + } + + virtual void operator()(float *dst_orig, float *dst, const float *bias, const int len, const int oc_start, const int oc_work, const int oc_stride, + const std::vector& post_ops_binary_rhs_arg_vec) const override; + + static bool post_ops_ok(const convolution_pd_t *pd) { + using namespace dnnl::impl::primitive_kind; + const auto& po = pd->attr()->post_ops_; + for (int i = 0; i < po.len(); i++) { + if (!utils::one_of(po.entry_[i].kind, eltwise, depthwise, quantization)) { + return false; + } + } + return true; + } + +private: + nstl::vector ref_eltwise_injectors_; + nstl::vector ref_depthwise_injectors_; +}; + +void ref_pp_kernel_t::operator()(float *dst_orig, float *dst, const float *bias, const int len,const int oc_start, const int oc_work, const int oc_stride, + const std::vector& post_ops_binary_rhs_arg_vec) const { + // TODO: for "outer threading" we have parallel section within + // outermost "parallel". It is not good. Consider to use + // "parallel" here with number of threads passed as parameter + const auto &p = post_ops_; + bool need_bias = do_bias_; + if (p.len() > 0) { + std::size_t post_ops_data_idx = 0; + int eltwise_inj_idx = 0; + int depthwise_inj_idx = 0; + + for (int i = 0; i < p.len(); i++) { + auto &post_op = p.entry_[i]; + // todo: sum? + if (post_op.is_eltwise()) { + parallel_nd(oc_work, [&](const int oc) { + float b = need_bias ? bias[oc_start + oc] : 0; + float *d_ = dst + oc * oc_stride; + for (int oS = 0; oS < len; ++oS) { + d_[oS] += b; + d_[oS] = ref_eltwise_injectors_[eltwise_inj_idx]->compute_scalar(d_[oS]); + } + }); + + eltwise_inj_idx++; + need_bias = false; + } else if (post_op.is_depthwise()) { + auto depthwise_base = reinterpret_cast(post_ops_binary_rhs_arg_vec[post_ops_data_idx]); + auto depthwise_weights = depthwise_base + post_op.depthwise.offset[post_op.depthwise.scales]; + auto depthwise_bias = depthwise_base + post_op.depthwise.offset[post_op.depthwise.shifts]; + + parallel_nd(oc_work, [&](const int oc) { + float b = need_bias ? 
bias[oc_start + oc] : 0; + float *d_ = dst + oc * oc_stride; + for (int oS = 0; oS < len; ++oS) { + d_[oS] += b; + d_[oS] = ref_depthwise_injectors_[depthwise_inj_idx]->compute_scalar(d_[oS], + depthwise_weights + oc_start + oc, + depthwise_bias + oc_start + oc); + } + }); + + post_ops_data_idx++; + depthwise_inj_idx++; + need_bias = false; + } else if (post_op.is_quantization()) { + auto quant = post_op.quantization; + auto quantization_base = reinterpret_cast(post_ops_binary_rhs_arg_vec[post_ops_data_idx]); + auto pcl = quantization_base + post_op.quantization.offset[quant.crop_low]; + auto pch = quantization_base + post_op.quantization.offset[quant.crop_high]; + auto pisc = quantization_base + post_op.quantization.offset[quant.inp_scale]; + auto pish = quantization_base + post_op.quantization.offset[quant.inp_shift]; + auto posc = quantization_base + post_op.quantization.offset[quant.output_scale]; + auto posh = quantization_base + post_op.quantization.offset[quant.output_shift]; + + parallel_nd(oc_work, [&](const int oc) { + float b = need_bias ? bias[oc_start + oc] : 0; + float *d_ = dst + oc * oc_stride; + + int cl_idx = !quant.per_channel[quant.crop_low] ? 0 : oc_start + oc; + int ch_idx = !quant.per_channel[quant.crop_high] ? 0 : oc_start + oc; + int isc_idx = !quant.per_channel[quant.inp_scale] ? 0 : oc_start + oc; + int ish_idx = !quant.per_channel[quant.inp_shift] ? 0 : oc_start + oc; + int osc_idx = !quant.per_channel[quant.output_scale] ? 0 : oc_start + oc; + int osh_idx = !quant.per_channel[quant.output_shift] ? 0 : oc_start + oc; + + PRAGMA_OMP_SIMD() + for (int oS = 0; oS < len; ++oS) { + d_[oS] += b; + + d_[oS] = nstl::min(pch[ch_idx], nstl::max(pcl[cl_idx], d_[oS])); + d_[oS] = d_[oS] * pisc[isc_idx] + pish[ish_idx]; + d_[oS] = roundf(d_[oS]); + d_[oS] = d_[oS] * posc[osc_idx] + posh[osh_idx]; + } + }); + + post_ops_data_idx++; + need_bias = false; + } + } + } + + if (need_bias) { + parallel_nd(oc_work, [&](const int oc) { + float b = bias[oc_start + oc]; + float *d_ = dst + oc * oc_stride; + PRAGMA_OMP_SIMD() + for (int oS = 0; oS < len; ++oS) { + d_[oS] += b; + } + }); + } +} + +// Interface section + +pp_kernel_t::pp_kernel_t(const convolution_pd_t *pd, const conv_gemm_conf_t &jcp) + : do_bias_(pd->with_bias()), post_ops_(pd->attr()->post_ops_) {} + +pp_kernel_t *pp_kernel_t::create( + const convolution_pd_t *pd, const conv_gemm_conf_t &jcp) { +#if DNNL_X64 + auto *res + = x64::gemm_convolution_utils::jit_pp_kernel_create(pd, jcp); + if (res) return res; +#endif + + if (ref_pp_kernel_t::post_ops_ok(pd)) { + return new ref_pp_kernel_t(pd, jcp); + } + + return nullptr; +} +} // namespace gemm_convolution_utils + namespace jit_gemm_convolution_utils { template void im2col_3d(const conv_gemm_conf_t &jcp, const data_type_t *im, data_type_t *col, dim_t od, int spatial_step, int spatial_block) { using data_t = - typename conditional::data_type == bf16, + typename conditional::data_type == bf16, uint16_t, data_type_t>::type; const data_t *__restrict _im = reinterpret_cast(im); @@ -277,13 +442,14 @@ template void transpose_dt(const conv_gemm_conf_t &jcp, /* col[kd][kh][kw][g][ic][od][oh][ow] <-- im2col_dt_3d(im[id][ih][iw][g][ic]) */ template void im2col_dt_3d(const conv_gemm_conf_t &jcp, const void *__restrict _imtr, - orig_col_dt *__restrict _col, dim_t od) { + orig_col_dt *__restrict _col, dim_t od, const uint8_t *__restrict input_zp) { // For performance reasons, use uint16_t as a proxy for bfloat16_t - using im_dt = typename utils::conditional::data_type - == bf16, - 
uint16_t, orig_im_dt>::type; + using im_dt = + typename utils::conditional::data_type + == bf16, + uint16_t, orig_im_dt>::type; using col_dt = - typename utils::conditional::data_type + typename utils::conditional::data_type == bf16, uint16_t, orig_col_dt>::type; const im_dt *__restrict imtr @@ -307,15 +473,18 @@ void im2col_dt_3d(const conv_gemm_conf_t &jcp, const void *__restrict _imtr, const dim_t IHW = jcp.ih * jcp.iw; const dim_t OHW = jcp.oh * jcp.ow; + bool with_input_zp = input_zp != nullptr; + if (sd == 1 && sh == 1 && sw == 1 && dd == 1 && dh == 1 && dw == 1) - parallel_nd(jcp.kd, jcp.kh, jcp.kw, jcp.ic, + parallel_nd_legacy(jcp.kd, jcp.kh, jcp.kw, jcp.ic, [&](dim_t kd, dim_t kh, dim_t kw, dim_t ic) { col_dt *__restrict col_loc = col + kd * col_kd_s + kh * col_kh_s + kw * col_kw_s + ic * col_ic_s; const dim_t id = od - fp + kd; if (id < 0 || id >= jcp.id) { + col_dt izp = with_input_zp ? (col_dt)input_zp[ic] : shift; for (ptrdiff_t i = 0; i < OHW; i++) - col_loc[i] = shift; + col_loc[i] = izp; return; } const im_dt *__restrict imtr_loc @@ -337,14 +506,15 @@ void im2col_dt_3d(const conv_gemm_conf_t &jcp, const void *__restrict _imtr, } }); else if (sd == 2 && sh == 2 && sw == 2 && dd == 1 && dh == 1 && dw == 1) - parallel_nd(jcp.kd, jcp.kh, jcp.kw, jcp.ic, + parallel_nd_legacy(jcp.kd, jcp.kh, jcp.kw, jcp.ic, [&](dim_t kd, dim_t kh, dim_t kw, dim_t ic) { col_dt *__restrict col_loc = col + kd * col_kd_s + kh * col_kh_s + kw * col_kw_s + ic * col_ic_s; const dim_t id = od * 2 - fp + kd; if (id < 0 || id >= jcp.id) { + col_dt izp = with_input_zp ? (col_dt)input_zp[ic] : shift; for (ptrdiff_t i = 0; i < OHW; i++) - col_loc[i] = shift; + col_loc[i] = izp; return; } const im_dt *__restrict imtr_loc @@ -368,14 +538,15 @@ void im2col_dt_3d(const conv_gemm_conf_t &jcp, const void *__restrict _imtr, } }); else - parallel_nd(jcp.kd, jcp.kh, jcp.kw, jcp.ic, + parallel_nd_legacy(jcp.kd, jcp.kh, jcp.kw, jcp.ic, [&](dim_t kd, dim_t kh, dim_t kw, dim_t ic) { col_dt *__restrict col_loc = col + kd * col_kd_s + kh * col_kh_s + kw * col_kw_s + ic * col_ic_s; const dim_t id = od * sd - fp + kd * dd; if (id < 0 || id >= jcp.id) { + col_dt izp = with_input_zp ? 
(col_dt)input_zp[ic] : shift; for (ptrdiff_t i = 0; i < OHW; i++) - col_loc[i] = shift; + col_loc[i] = izp; return; } const im_dt *__restrict imtr_loc @@ -402,13 +573,13 @@ void im2col_dt_3d(const conv_gemm_conf_t &jcp, const void *__restrict _imtr, } template void im2col_dt_3d(const conv_gemm_conf_t &jcp, - const void *__restrict im, uint8_t *__restrict col, dim_t od); + const void *__restrict im, uint8_t *__restrict col, dim_t od, const uint8_t *__restrict input_zp); template void im2col_dt_3d(const conv_gemm_conf_t &jcp, - const void *__restrict im, uint8_t *__restrict col, dim_t od); + const void *__restrict im, uint8_t *__restrict col, dim_t od, const uint8_t *__restrict input_zp); template void im2col_dt_3d(const conv_gemm_conf_t &jcp, - const void *__restrict im, float *__restrict col, dim_t od); + const void *__restrict im, float *__restrict col, dim_t od, const uint8_t *__restrict input_zp); template void im2col_dt_3d(const conv_gemm_conf_t &jcp, - const void *__restrict im, bfloat16_t *__restrict col, dim_t od); + const void *__restrict im, bfloat16_t *__restrict col, dim_t od, const uint8_t *__restrict input_zp); /* col[ic][kh][kw][oh][ow] <-- im2col(im[ic][ih][iw]) */ template @@ -416,7 +587,7 @@ void im2col(const conv_gemm_conf_t &jcp, const data_type_t *__restrict im, data_type_t *__restrict col, dim_t ss, dim_t sb, dim_t cs, dim_t cb) { using data_t = - typename utils::conditional::data_type + typename utils::conditional::data_type == bf16, uint16_t, data_type_t>::type; const data_t *__restrict _im @@ -511,7 +682,7 @@ void im2col(const conv_gemm_conf_t &jcp, const data_type_t *__restrict im, // Generated code is more optimized for stride_w == 1 // because innermost loop is by width if (sw == 1) - parallel_nd(cb, jcp.kh, jcp.kw, oh_range, + parallel_nd_legacy(cb, jcp.kh, jcp.kw, oh_range, [&](dim_t ic, dim_t kh, dim_t kw, dim_t ohr) { const dim_t oh = ohr + oh_begin; const dim_t ih = oh * sh - tp + kh * dh; @@ -536,7 +707,7 @@ void im2col(const conv_gemm_conf_t &jcp, const data_type_t *__restrict im, } }); else - parallel_nd(cb, jcp.kh, jcp.kw, oh_range, + parallel_nd_legacy(cb, jcp.kh, jcp.kw, oh_range, [&](dim_t ic, dim_t kh, dim_t kw, dim_t ohr) { const dim_t oh = ohr + oh_begin; const dim_t ih = oh * sh - tp + kh * dh; @@ -575,13 +746,14 @@ template void im2col(const conv_gemm_conf_t &jcp, template void im2col_dt(const conv_gemm_conf_t &jcp, const void *__restrict _im, void *__restrict _imtr, orig_col_dt *__restrict _col, dim_t hs, - dim_t hb, dim_t ws, dim_t wb) { + dim_t hb, dim_t ws, dim_t wb, const uint8_t *__restrict input_zp) { // For performance reasons, use uint16_t as a proxy for bfloat16_t - using im_dt = typename utils::conditional::data_type - == bf16, - uint16_t, orig_im_dt>::type; + using im_dt = + typename utils::conditional::data_type + == bf16, + uint16_t, orig_im_dt>::type; using col_dt = - typename utils::conditional::data_type + typename utils::conditional::data_type == bf16, uint16_t, orig_col_dt>::type; const im_dt *__restrict im = reinterpret_cast(_im); @@ -598,6 +770,8 @@ void im2col_dt(const conv_gemm_conf_t &jcp, const void *__restrict _im, const dim_t tp = jcp.t_pad; const dim_t lp = jcp.l_pad; + bool with_input_zp = input_zp != nullptr; + if (jcp.outer_threading && sh == 1 && sw == 1 && dh == 1 && dw == 1) { /* im[ih][iw][ic] --> imtr[ic][ih][iw] --> col[kh][kw][ic][oh][ow] */ const dim_t hp = hs - tp; @@ -641,61 +815,103 @@ void im2col_dt(const conv_gemm_conf_t &jcp, const void *__restrict _im, const dim_t ow_start = saturate(dim_t(0), wb, 
ow_kw); const dim_t ow_end = saturate(dim_t(0), wb, ow_kw + iwb); for (dim_t ic = 0; ic < jcp.ic; ic++) { + uint8_t izp = with_input_zp ? input_zp[ic] : (uint8_t) 0; const ptrdiff_t col_idx_ic = col_idx_kw + ic * col_ic_str; const dim_t imtr_idx_ic = ic * imtr_ic_stride - imtr_shift; for (dim_t oh = 0; oh < oh_start; oh++) { const ptrdiff_t col_idx_oh = col_idx_ic + oh * wb; - for (dim_t ow = 0; ow < wb; ++ow) - col[col_idx_oh + ow] = shift; + if (with_input_zp) { + for (dim_t ow = 0; ow < wb; ++ow) + col[col_idx_oh + ow] = izp; + } else { + for (dim_t ow = 0; ow < wb; ++ow) + col[col_idx_oh + ow] = shift; + } } for (dim_t oh = oh_start; oh < oh_end; oh++) { const ptrdiff_t col_idx_oh = col_idx_ic + oh * wb; const ptrdiff_t imtr_idx_oh = imtr_idx_ic + oh * iwb; - for (dim_t ow = 0; ow < ow_start; ++ow) - col[col_idx_oh + ow] = shift; - for (dim_t ow = ow_start; ow < ow_end; ++ow) - col[col_idx_oh + ow] - = imtr[imtr_idx_oh + ow] + shift; - for (dim_t ow = ow_end; ow < wb; ++ow) - col[col_idx_oh + ow] = shift; + if (with_input_zp) { + for (dim_t ow = 0; ow < ow_start; ++ow) + col[col_idx_oh + ow] = izp; + for (dim_t ow = ow_start; ow < ow_end; ++ow) + col[col_idx_oh + ow] + = imtr[imtr_idx_oh + ow]; + for (dim_t ow = ow_end; ow < wb; ++ow) + col[col_idx_oh + ow] = izp; + } else { + for (dim_t ow = 0; ow < ow_start; ++ow) + col[col_idx_oh + ow] = shift; + for (dim_t ow = ow_start; ow < ow_end; ++ow) + col[col_idx_oh + ow] + = imtr[imtr_idx_oh + ow] + shift; + for (dim_t ow = ow_end; ow < wb; ++ow) + col[col_idx_oh + ow] = shift; + } } for (dim_t oh = oh_end; oh < hb; oh++) { const ptrdiff_t col_idx_oh = col_idx_ic + oh * wb; - for (dim_t ow = 0; ow < wb; ++ow) - col[col_idx_oh + ow] = shift; + if (with_input_zp) { + for (dim_t ow = 0; ow < wb; ++ow) + col[col_idx_oh + ow] = izp; + } else { + for (dim_t ow = 0; ow < wb; ++ow) + col[col_idx_oh + ow] = shift; + } } } } } } else { - parallel_nd(jcp.kh, jcp.kw, jcp.ic, hb, + parallel_nd_legacy(jcp.kh, jcp.kw, jcp.ic, hb, [&](dim_t kh, dim_t kw, dim_t ic, dim_t oh) { const dim_t hp = tp - kh * dh; const dim_t ih = (oh + hs) * sh - hp; const ptrdiff_t col_idx_base = (((kh * jcp.kw + kw) * jcp.ic + ic) * hb + oh) * wb; + uint8_t izp = with_input_zp ? 
input_zp[ic] : (uint8_t) 0; if (ih < 0 || ih >= jcp.ih) - for (dim_t ow = 0; ow < wb; ow++) - col[col_idx_base + ow] = shift; + if (with_input_zp) { + for (dim_t ow = 0; ow < wb; ow++) + col[col_idx_base + ow] = izp; + } else { + for (dim_t ow = 0; ow < wb; ow++) + col[col_idx_base + ow] = shift; + } else { const dim_t wp = lp - kw * dw; const dim_t ow_start = saturate(dim_t(0), wb, div_up(wp, sw) - ws); const dim_t ow_end = saturate( dim_t(0), wb, div_up(jcp.iw + wp, sw) - ws); - for (dim_t ow = 0; ow < ow_start; ow++) - col[col_idx_base + ow] = shift; - const dim_t iw_base = ws * sw - wp; - const ptrdiff_t im_idx_base = ih * im_ih_stride + ic; - for (dim_t ow = ow_start; ow < ow_end; ow++) { - const dim_t iw = iw_base + ow * sw; - const ptrdiff_t im_idx - = im_idx_base + iw * im_iw_stride; - col[col_idx_base + ow] = im[im_idx] + shift; + if (with_input_zp) { + for (dim_t ow = 0; ow < ow_start; ow++) + col[col_idx_base + ow] = izp; + const dim_t iw_base = ws * sw - wp; + const ptrdiff_t im_idx_base = ih * im_ih_stride + ic; + for (dim_t ow = ow_start; ow < ow_end; ow++) { + const dim_t iw = iw_base + ow * sw; + const ptrdiff_t im_idx + = im_idx_base + iw * im_iw_stride; + col[col_idx_base + ow] = im[im_idx]; + } + for (dim_t ow = ow_end; ow < wb; ow++) + col[col_idx_base + ow] = izp; + } else { + for (dim_t ow = 0; ow < ow_start; ow++) + col[col_idx_base + ow] = shift; + const dim_t iw_base = ws * sw - wp; + const ptrdiff_t im_idx_base = ih * im_ih_stride + ic; + for (dim_t ow = ow_start; ow < ow_end; ow++) { + const dim_t iw = iw_base + ow * sw; + const ptrdiff_t im_idx + = im_idx_base + iw * im_iw_stride; + col[col_idx_base + ow] = im[im_idx] + shift; + } + for (dim_t ow = ow_end; ow < wb; ow++) + col[col_idx_base + ow] = shift; } - for (dim_t ow = ow_end; ow < wb; ow++) - col[col_idx_base + ow] = shift; } }); } @@ -703,26 +919,25 @@ void im2col_dt(const conv_gemm_conf_t &jcp, const void *__restrict _im, template void im2col_dt(const conv_gemm_conf_t &jcp, const void *__restrict im, void *__restrict imtr, - uint8_t *__restrict col, dim_t hs, dim_t hb, dim_t ws, dim_t wb); + uint8_t *__restrict col, dim_t hs, dim_t hb, dim_t ws, dim_t wb, const uint8_t *__restrict input_zp); template void im2col_dt(const conv_gemm_conf_t &jcp, const void *__restrict im, void *__restrict imtr, - uint8_t *__restrict col, dim_t hs, dim_t hb, dim_t ws, dim_t wb); + uint8_t *__restrict col, dim_t hs, dim_t hb, dim_t ws, dim_t wb, const uint8_t *__restrict input_zp); template void im2col_dt(const conv_gemm_conf_t &jcp, const void *__restrict im, void *__restrict imtr, float *__restrict col, - dim_t hs, dim_t hb, dim_t ws, dim_t wb); + dim_t hs, dim_t hb, dim_t ws, dim_t wb, const uint8_t *__restrict input_zp); template void im2col_dt(const conv_gemm_conf_t &jcp, const void *__restrict im, void *__restrict imtr, - bfloat16_t *__restrict col, dim_t hs, dim_t hb, dim_t ws, dim_t wb); + bfloat16_t *__restrict col, dim_t hs, dim_t hb, dim_t ws, dim_t wb, const uint8_t *__restrict input_zp); /* im[id][ih][iw][ic] <-- col2im_dt_3d(col[od][oh][ow][kd][kh][kw][ic]) */ template void col2im_dt(const conv_gemm_conf_t &jcp, const orig_T *__restrict _col, orig_T *__restrict _im) { // For performance reasons, use uint16_t as a proxy for bfloat16_t - using T = - typename utils::conditional::data_type == bf16, - uint16_t, orig_T>::type; + using T = typename utils::conditional< + data_traits_t::data_type == bf16, uint16_t, orig_T>::type; const T *__restrict col = reinterpret_cast(_col); T *__restrict im = reinterpret_cast(_im); 
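All of the `with_input_zp` branches added to `im2col_dt` above change the same two things: padded (out-of-image) positions are filled with the per-channel input zero point instead of the generic `shift`, and in-image values are copied without the `+ shift` re-encoding. A simplified 1D sketch of that rule (hypothetical helper; unit stride and dilation assumed):

#include <cstddef>
#include <cstdint>

// One im2col row for a single (channel, kernel tap) pair.
void im2col_row_1d(const uint8_t *im, uint8_t *col, ptrdiff_t iw, ptrdiff_t ow,
        ptrdiff_t pad, ptrdiff_t kw_off,
        const uint8_t *input_zp, // per-channel zero point, may be nullptr
        uint8_t shift) {         // e.g. 128 when re-encoding s8 as u8
    const uint8_t fill = input_zp ? *input_zp : shift;
    for (ptrdiff_t o = 0; o < ow; ++o) {
        const ptrdiff_t i = o - pad + kw_off;
        if (i < 0 || i >= iw)
            col[o] = fill; // padding carries the zero point, not a constant
        else
            col[o] = input_zp ? im[i] : uint8_t(im[i] + shift);
    }
}

Duplicating each loop per branch, as the patch does, keeps the `with_input_zp` check out of the innermost loop at the cost of some repetition.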
@@ -1080,16 +1295,16 @@ status_t init_conf(conv_gemm_conf_t &jcp, CHECK(memory_desc_init_by_tag(src_md, desired_src_tag)); src_tag = desired_src_tag; } else { - src_tag = memory_desc_matches_one_of_tag( - src_md, nwc, nhwc, ndhwc, ncw, nchw, ncdhw); + src_tag = src_d.mb_stride_relaxed_match( + nwc, nhwc, ndhwc, ncw, nchw, ncdhw); } if (dst_d.format_kind() == format_kind::any) { CHECK(memory_desc_init_by_tag(dst_md, desired_dst_tag)); dst_tag = desired_dst_tag; } else { - dst_tag = memory_desc_matches_one_of_tag( - dst_md, nwc, nhwc, ndhwc, ncw, nchw, ncdhw); + dst_tag = dst_d.mb_stride_relaxed_match( + nwc, nhwc, ndhwc, ncw, nchw, ncdhw); } if (src_tag == format_tag::undef || dst_tag == format_tag::undef) @@ -1134,6 +1349,29 @@ const bool is_bwd_w = jcp.prop_kind == backward_weights; const bool is_fwd = !is_bwd_d && !is_bwd_w; + const auto src_max_size + = static_cast<size_t>(jcp.iw) * jcp.ih * jcp.id * jcp.ic * 4; + const auto dst_max_size + = static_cast<size_t>(jcp.ow) * jcp.oh * jcp.od * jcp.oc * 4; + VDISPATCH_CONV_IC(src_max_size <= INT_MAX && dst_max_size <= INT_MAX, + VERBOSE_UNSUPPORTED_FEATURE, + "src/dst size > INT_MAX is not supported"); + + jcp.with_input_zp = !attr.input_zero_points_.has_default_values(); + if (jcp.with_input_zp) { + if (attr.input_zero_points_.count_ != 1 && attr.input_zero_points_.count_ != jcp.ic * jcp.ngroups) + return status::unimplemented; + + if (attr.output_compensations_.count_ != jcp.oc * jcp.ngroups) + return status::unimplemented; + } + + jcp.with_weights_zp = !attr.weights_zero_points_.has_default_values(); + if (jcp.with_weights_zp) { + if (attr.weights_zero_points_.count_ != 1 && attr.weights_zero_points_.count_ != jcp.oc * jcp.ngroups) + return status::unimplemented; + } + bool is_int8_conv = (is_fwd ? utils::one_of(src_d.data_type(), s8, u8) : utils::one_of(dst_d.data_type(), s8, u8)) && weights_d.data_type() == s8; @@ -1165,7 +1403,7 @@ VDISPATCH_CONV_IC( post_ops_ok(post_ops_ok_args_t(x64::avx512_core, - {binary, eltwise, sum}, attr.post_ops_, &dst_d, + {binary, eltwise, sum, depthwise, prelu}, attr.post_ops_, &dst_d, sum_at_pos_0_only, sum_requires_scale_one, sum_requires_zp_zero)), VERBOSE_UNSUPPORTED_POSTOP); @@ -1181,6 +1419,8 @@ jcp.with_binary = !everyone_is(-1, binary_ind, prelu_ind); const int sum_ind = jcp.post_ops.find(primitive_kind::sum); jcp.with_sum = sum_ind != -1; + const int depthwise_ind = jcp.post_ops.find(primitive_kind::depthwise); + jcp.with_depthwise = depthwise_ind != -1; bool is_bf16_conv = false || (is_fwd @@ -2125,8 +2365,8 @@ status_t init_conf(conv_gemm_conf_t &jcp, jcp.dst_os_stride = dst_d.is_blocking_desc() ? 
dst_d.blocking_desc().strides[ndims - 1] : 0; - jcp.scale_idx_mult = attr.scales_.get(DNNL_ARG_WEIGHTS).mask_ != 0; - jcp.with_dst_scale = !attr.scales_.get(DNNL_ARG_DST).has_default_values(); + jcp.scale_idx_mult = attr.scales_.get_mask(DNNL_ARG_WEIGHTS) > 0; + jcp.with_dst_scale = !attr.scales_.has_default_values(DNNL_ARG_DST); book_precomputed_scales(scratchpad, attr.scales_, jcp.ngroups * jcp.oc); if (jcp.zp.src_exists) { @@ -2134,8 +2374,8 @@ status_t init_conf(conv_gemm_conf_t &jcp, if (size) scratchpad.book(key_conv_gemm_zp_src_comp, size); } - VDISPATCH_CONV_IC( - scratchpad.size() <= scratchpad_limit, VERBOSE_SCRATCHPAD_LIMIT); + // VDISPATCH_CONV_IC( + // scratchpad.size() <= scratchpad_limit, VERBOSE_SCRATCHPAD_LIMIT); return status::success; } diff --git a/src/cpu/gemm_convolution_utils.hpp b/src/cpu/gemm_convolution_utils.hpp index 43e9784bc44..222b6d5f71b 100644 --- a/src/cpu/gemm_convolution_utils.hpp +++ b/src/cpu/gemm_convolution_utils.hpp @@ -43,6 +43,7 @@ struct conv_gemm_conf_t { bool with_bias; bool with_eltwise; bool with_binary; + bool with_depthwise; bool with_sum; post_ops_t post_ops; bool is_nspc; @@ -69,6 +70,9 @@ struct conv_gemm_conf_t { size_t dst_os_stride; size_t scale_idx_mult; bool with_dst_scale; + + bool with_input_zp; + bool with_weights_zp; }; struct single_gemm_conv_chunk_desc_t { @@ -84,6 +88,28 @@ struct single_gemm_conv_chunk_desc_t { dim_t w_size_ = 0; }; +namespace gemm_convolution_utils { + +struct pp_kernel_t { + static pp_kernel_t *create( + const convolution_pd_t *pd, const conv_gemm_conf_t &jcp); + + virtual ~pp_kernel_t() = default; + + virtual void operator()(float *dst_orig, float *dst, const float *bias, const int len, const int oc_start, const int oc_work, const int oc_stride, + const std::vector& post_ops_binary_rhs_arg_vec) const = 0; + + virtual status_t create_kernel() { return status::success; } + +protected: + pp_kernel_t(const convolution_pd_t *pd, const conv_gemm_conf_t &jcp); + + bool do_bias_ = false; + post_ops_t post_ops_; +}; + +} // namespace gemm_convolution_utils + namespace jit_gemm_convolution_utils { template void im2col_3d(const conv_gemm_conf_t &jcp, const data_type_t *im, @@ -95,7 +121,7 @@ void transpose_dt(const conv_gemm_conf_t &jcp, const T *__restrict im, template void im2col_dt_3d(const conv_gemm_conf_t &jcp, const void *__restrict im, - col_dt *__restrict col, dim_t od); + col_dt *__restrict col, dim_t od, const uint8_t *__restrict input_zp = nullptr); template void im2col(const conv_gemm_conf_t &jcp, const data_type_t *__restrict im, @@ -104,7 +130,7 @@ void im2col(const conv_gemm_conf_t &jcp, const data_type_t *__restrict im, template void im2col_dt(const conv_gemm_conf_t &jcp, const void *__restrict im, void *__restrict imtr, col_dt *__restrict col, dim_t hs, dim_t hb, - dim_t ws, dim_t wb); + dim_t ws, dim_t wb, const uint8_t *__restrict input_zp = nullptr); template void col2im_dt( diff --git a/src/cpu/gemm_inner_product.hpp b/src/cpu/gemm_inner_product.hpp index 1b7df0d241e..ce32c913024 100644 --- a/src/cpu/gemm_inner_product.hpp +++ b/src/cpu/gemm_inner_product.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2016-2024 Intel Corporation +* Copyright 2016-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
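The new `gemm_convolution_utils::pp_kernel_t` interface above follows the factory pattern used elsewhere in the library: `create()` first asks the x64 backend for a JIT kernel and falls back to the scalar reference implementation only when that fails. A schematic of the dispatch, with made-up stub names standing in for the real kernels:

#include <memory>

struct pp_kernel_iface {
    virtual ~pp_kernel_iface() = default;
    virtual void run(float *dst, int len) const = 0;
};

struct jit_pp_kernel_stub : pp_kernel_iface {
    // Stand-in for x64::gemm_convolution_utils::jit_pp_kernel_create():
    // a real build returns nullptr when the post-op chain is not JIT-able.
    static pp_kernel_iface *try_create(bool jit_ok) {
        return jit_ok ? new jit_pp_kernel_stub() : nullptr;
    }
    void run(float *, int) const override { /* generated code would run here */ }
};

struct ref_pp_kernel_stub : pp_kernel_iface {
    void run(float *dst, int len) const override {
        for (int i = 0; i < len; ++i)
            dst[i] = dst[i] < 0.f ? 0.f : dst[i]; // e.g. a scalar ReLU post-op
    }
};

std::unique_ptr<pp_kernel_iface> create_pp_kernel(bool jit_ok, bool ref_ok) {
    if (auto *k = jit_pp_kernel_stub::try_create(jit_ok))
        return std::unique_ptr<pp_kernel_iface>(k);
    if (ref_ok)
        return std::unique_ptr<pp_kernel_iface>(new ref_pp_kernel_stub());
    return nullptr; // caller reports status::unimplemented
}

In the patch itself, `pp_kernel_t::create` returns a raw pointer that the primitive immediately wraps via `safe_ptr_assign`, so ownership still ends up scoped in a `std::unique_ptr`.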
@@ -120,7 +120,7 @@ struct gemm_inner_product_fwd_t : public primitive_t { return pp_kernel_->create_kernel(); } - typedef typename prec_traits::type data_t; + using data_t = typename prec_traits_t::type; status_t execute(const exec_ctx_t &ctx) const override { return execute_forward(ctx); @@ -163,7 +163,7 @@ struct gemm_inner_product_bwd_data_t : public primitive_t { }; gemm_inner_product_bwd_data_t(const pd_t *apd) : primitive_t(apd) {} - typedef typename prec_traits::type data_t; + using data_t = typename prec_traits_t::type; status_t execute(const exec_ctx_t &ctx) const override { return execute_backward_data(ctx); @@ -208,7 +208,7 @@ struct gemm_inner_product_bwd_weights_t : public primitive_t { }; gemm_inner_product_bwd_weights_t(const pd_t *apd) : primitive_t(apd) {} - typedef typename prec_traits::type data_t; + using data_t = typename prec_traits_t::type; status_t execute(const exec_ctx_t &ctx) const override { return execute_backward_weights(ctx); diff --git a/src/cpu/gemm_inner_product_utils.cpp b/src/cpu/gemm_inner_product_utils.cpp index 815e953898b..2d637d543cf 100644 --- a/src/cpu/gemm_inner_product_utils.cpp +++ b/src/cpu/gemm_inner_product_utils.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2023 Intel Corporation +* Copyright 2019-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -157,17 +157,17 @@ pp_kernel_t::pp_kernel_t(size_t OC, size_t MB, dim_t dst_mb_stride, , bias_data_type_(bias_dt) , acc_data_type_(acc_dt) , dst_data_type_(dst_md->data_type) - , do_scale_(!attr->scales_.get(DNNL_ARG_SRC).has_default_values() - || !attr->scales_.get(DNNL_ARG_WEIGHTS).has_default_values()) + , do_scale_(!attr->scales_.has_default_values(DNNL_ARG_SRC) + || !attr->scales_.has_default_values(DNNL_ARG_WEIGHTS)) , ndims_(dst_md->ndims) { - if (do_scale_) { - int wei_mask = attr->scales_.get(DNNL_ARG_WEIGHTS).mask_; + if (!attr->scales_.has_default_values(DNNL_ARG_WEIGHTS)) { + int wei_mask = attr->scales_.get_mask(DNNL_ARG_WEIGHTS); // matmul: per_oc: 1 << (ndims_ - 1) // ip: per_oc: 1 << 0 scale_idx_mult_ = wei_mask == (1 << (ndims_ - 1)) || wei_mask == 1 << 0; } - do_dst_scale_ = !attr->scales_.get(DNNL_ARG_DST).has_default_values(); + do_dst_scale_ = !attr->scales_.has_default_values(DNNL_ARG_DST); post_ops_ = attr->post_ops_; const int eltwise_ind = post_ops_.find(primitive_kind::eltwise); diff --git a/src/cpu/gemm_x8s8s32x_convolution.cpp b/src/cpu/gemm_x8s8s32x_convolution.cpp index 8482ae65eb0..8f464e14eae 100644 --- a/src/cpu/gemm_x8s8s32x_convolution.cpp +++ b/src/cpu/gemm_x8s8s32x_convolution.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2017-2024 Intel Corporation +* Copyright 2017-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
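Several hunks above replace `scales_.get(ARG).mask_` with `scales_.get_mask(ARG)`; in both APIs, mask 0 means a single scale for the whole tensor, while a set bit on the output-channel dimension means one scale per channel. The `scale_idx_mult_` member then collapses the two cases into an index multiplier so the hot loop carries no branch. A minimal sketch of the trick:

#include <cstdint>

// scale_idx_mult == 0: scales[] has one common entry, every oc reads index 0.
// scale_idx_mult == 1: scales[] has one entry per output channel.
inline float dequantize(int32_t acc, const float *scales, int oc,
        int scale_idx_mult) {
    return static_cast<float>(acc) * scales[oc * scale_idx_mult];
}

The switch from `!= 0` to `> 0` is presumably defensive against an "unset" sentinel from `get_mask`; any real per-channel mask is a positive value, so behavior is unchanged for the supported cases.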
@@ -121,6 +121,9 @@ status_t gemm_x8s8s32x_convolution_fwd_t::execute_forward( = binary_injector_utils::prepare_binary_args( this->pd()->attr()->post_ops_, ctx); + DEFINE_INPUT_ZERO_POINTS_BUFFER(input_zp_base, jcp); + DEFINE_OUTPUT_COMPENSATION_BUFFER(output_compensation_base, jcp); + auto scratchpad = ctx.get_scratchpad_grantor(); assert(IMPLICATION(jcp.ow_block != jcp.ow, jcp.oh_block == 1)); @@ -135,15 +138,15 @@ status_t gemm_x8s8s32x_convolution_fwd_t::execute_forward( DEFINE_ARG_SCALES_BUFFER(wei_scales, DNNL_ARG_WEIGHTS); DEFINE_ARG_SCALES_BUFFER(dst_scales, DNNL_ARG_DST); - const int wei_scale_mask - = pd()->attr()->scales_.get(DNNL_ARG_WEIGHTS).mask_; + const int wei_scale_mask = pd()->attr()->scales_.get_mask(DNNL_ARG_WEIGHTS); const float *scales = precompute_scales(scratchpad, src_scales, wei_scales, - pd()->IC(), pd()->OC(), false, wei_scale_mask != 0, pd()->attr()); + pd()->IC(), pd()->OC(), false, wei_scale_mask > 0, pd()->attr()); parallel(jcp.nthr, [&](const int ithr, const int nthr) { status_t st_thr = execute_forward_thr(ithr, nthr, src_base, wei_base, bia_base, dst_base, scales, dst_scales, zp, scratchpad, - post_ops_binary_rhs_arg_vec.data(), ctx); + post_ops_binary_rhs_arg_vec.data(), ctx, + input_zp_base, output_compensation_base); if (st_thr != status::success) st = st_thr; }); @@ -163,7 +166,8 @@ status_t gemm_x8s8s32x_convolution_fwd_t::execute_forward_thr(const int ithr, const char *bia_base, void *dst_base, const float *scales, const float *dst_scales, const zero_point_call_params_t &zp, const memory_tracking::grantor_t &scratchpad, - const void *post_ops_binary_rhs_arg_vec, const exec_ctx_t &ctx) const { + const void *post_ops_binary_rhs_arg_vec, const exec_ctx_t &ctx, + const uint8_t *input_zp_base, const int32_t *output_compensation_base) const { const conv_gemm_conf_t &jcp = this->pd()->jcp_; @@ -190,18 +194,11 @@ status_t gemm_x8s8s32x_convolution_fwd_t::execute_forward_thr(const int ithr, + (ptrdiff_t)ithr * jcp.oh_block * jcp.ow_block * jcp.oc; const int32_t *_wei_comp - = jcp.signed_input ? get_wei_comp(wei_base, wei_md) : nullptr; - - const bool should_apply_zp_src_comp_pad = jcp.zp.src_exists - && jit_gemm_convolution_utils::padding_exists(jcp); - const bool should_apply_zp_src_comp_pad_jit_pp - = should_apply_zp_src_comp_pad - && gemm_x8s8s32x_convolution_utils::mayiuse_jit_pp_kernel( - dst_md.data_type()); - const bool should_apply_zp_src_comp_outside_pp - = should_apply_zp_src_comp_pad - && !gemm_x8s8s32x_convolution_utils::mayiuse_jit_pp_kernel( - dst_md.data_type()); + = jcp.signed_input ? get_wei_comp(wei_base, wei_md) : + jcp.with_input_zp ? output_compensation_base : nullptr; + + const bool should_apply_zp_src_comp_pad_jit_pp = false; + const bool should_apply_zp_src_comp_outside_pp = false; dim_t g {0}, n {0}, ohb {0}, owb {0}; dim_t start = 0, end = 0; @@ -217,7 +214,7 @@ status_t gemm_x8s8s32x_convolution_fwd_t::execute_forward_thr(const int ithr, balance211(work_amount, nthr, ithr, start, end); nd_iterator_init(start, n, jcp.mb, g, jcp.ngroups, ohb, nb_oh, owb, nb_ow); const uint8_t shift = jcp.signed_input ? 
128 : 0; - parallel_nd(jcp.im2col_sz, [&](ptrdiff_t i) { col[i] = shift; }); + parallel_nd_legacy(jcp.im2col_sz, [&](ptrdiff_t i) { col[i] = shift; }); status_t st = status::success; @@ -237,6 +234,11 @@ status_t gemm_x8s8s32x_convolution_fwd_t::execute_forward_thr(const int ithr, for (int od = 0; od < jcp.od; od++) { const auto dst_off = n * dst_mb_stride + g * dst_g_stride + ((od * jcp.oh + oh) * jcp.ow + ow) * jcp.dst_os_stride; + + const uint8_t *__restrict input_zp = nullptr; + if (jcp.with_input_zp) + input_zp = input_zp_base + g * jcp.ic; + char *__restrict dst = (char *)dst_base + types::data_type_size(dst_md.data_type()) * dst_off; if (jcp.im2col_sz) { @@ -244,20 +246,20 @@ status_t gemm_x8s8s32x_convolution_fwd_t::execute_forward_thr(const int ithr, case data_type::s8: { if (is_problem_3d) jit_gemm_convolution_utils::im2col_dt_3d(jcp, imtr, col, od); + uint8_t>(jcp, imtr, col, od, input_zp); else jit_gemm_convolution_utils::im2col_dt(jcp, src, imtr, col, oh, h_step, - ow, w_step); + ow, w_step, input_zp); } break; case data_type::u8: { if (is_problem_3d) jit_gemm_convolution_utils::im2col_dt_3d(jcp, imtr, col, od); + uint8_t>(jcp, imtr, col, od, input_zp); else jit_gemm_convolution_utils::im2col_dt(jcp, src, imtr, col, oh, h_step, - ow, w_step); + ow, w_step, input_zp); } break; default: assert(!"unsupported data type"); break; } @@ -275,10 +277,10 @@ status_t gemm_x8s8s32x_convolution_fwd_t::execute_forward_thr(const int ithr, const float onef = 1.f, zerof = 0.f; const char *__restrict src_od = src + od * jcp.oh * jcp.ow * jcp.ngroups * jcp.ic; - st = gemm_s8u8s32("N", BT, jcp.signed_input ? "C" : "F", &M, &N, &K, + st = gemm_s8u8s32("N", BT, (jcp.signed_input || jcp.with_input_zp) ? "C" : "F", &M, &N, &K, &onef, wei, &LDA, &off_a, jcp.im2col_sz ? col : (uint8_t *)src_od, &LDB, &off_b, - &zerof, acc, &M, jcp.signed_input ? wei_comp : &off_c); + &zerof, acc, &M, (jcp.signed_input || jcp.with_input_zp) ? wei_comp : &off_c); if (st != status::success) return st; @@ -358,16 +360,15 @@ status_t gemm_x8s8s32x_convolution_bwd_data_t::execute_backward_data_thr( const auto diff_src_dt_size = types::data_type_size(diff_src_md.data_type()); - const int scale_idx_mult = pd()->attr()->scales_.get(DNNL_ARG_WEIGHTS).mask_ + const int scale_idx_mult = pd()->attr()->scales_.get_mask(DNNL_ARG_WEIGHTS) == (1 << static_cast(pd()->with_groups())); DEFINE_ARG_SCALES_BUFFER(src_scales, DNNL_ARG_SRC); DEFINE_ARG_SCALES_BUFFER(wei_scales, DNNL_ARG_WEIGHTS); DEFINE_ARG_SCALES_BUFFER(dst_scales, DNNL_ARG_DST); - const int wei_scale_mask - = pd()->attr()->scales_.get(DNNL_ARG_WEIGHTS).mask_; + const int wei_scale_mask = pd()->attr()->scales_.get_mask(DNNL_ARG_WEIGHTS); const float *scales = precompute_scales(scratchpad, src_scales, wei_scales, - pd()->IC(), pd()->OC(), false, wei_scale_mask != 0, pd()->attr()); + pd()->IC(), pd()->OC(), false, wei_scale_mask > 0, pd()->attr()); const dim_t work_amount = jcp.ngroups * jcp.mb; diff --git a/src/cpu/gemm_x8s8s32x_convolution.hpp b/src/cpu/gemm_x8s8s32x_convolution.hpp index cb5cccd11b8..866d5f8927f 100644 --- a/src/cpu/gemm_x8s8s32x_convolution.hpp +++ b/src/cpu/gemm_x8s8s32x_convolution.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2017-2024 Intel Corporation +* Copyright 2017-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
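The `"F"` to `"C"` offset-mode change in the `gemm_s8u8s32` call above is the crux of both the signed-input and the new input-zero-point paths: the u8 image handed to the GEMM is biased (by +128, or by the input zero point), so a precomputed per-output-channel term must be folded back into the accumulator, which the column-offset mode does inside the GEMM. The compensation itself is just a weight reduction; a standalone sketch of how such a term could be built (not the library's actual kernel):

#include <cstdint>
#include <vector>

// An accumulator computed on (x + zp) overshoots the true value by
// zp * sum_k w[oc][k], so precompute -zp * sum_k w[oc][k] per channel
// and let the GEMM add it via its offset argument.
std::vector<int32_t> make_compensation(const int8_t *w, int OC, int K,
        int32_t zp) {
    std::vector<int32_t> comp(OC, 0);
    for (int oc = 0; oc < OC; ++oc) {
        int32_t s = 0;
        for (int k = 0; k < K; ++k)
            s += w[oc * K + k];
        comp[oc] = -zp * s;
    }
    return comp;
}

For `jcp.signed_input` the library stores this term alongside the weights (`get_wei_comp`), while the new `jcp.with_input_zp` path receives it ready-made as `output_compensation_base`.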
@@ -39,9 +39,7 @@ namespace cpu { struct gemm_x8s8s32x_convolution_fwd_t : public primitive_t { struct pd_t : public cpu_convolution_fwd_pd_t { - pd_t(const convolution_desc_t *adesc, const primitive_attr_t *attr, - const typename pd_t::base_class *hint_fwd_pd) - : cpu_convolution_fwd_pd_t(adesc, attr, hint_fwd_pd), jcp_() {} + using cpu_convolution_fwd_pd_t::cpu_convolution_fwd_pd_t; DECLARE_COMMON_PD_T(src_md()->data_type == data_type::u8 ? IGEMM_S8U8S32_IMPL_STR @@ -71,34 +69,56 @@ struct gemm_x8s8s32x_convolution_fwd_t : public primitive_t { VDISPATCH_CONV(!has_zero_dim_memory(), VERBOSE_EMPTY_TENSOR, ""); - VDISPATCH_CONV( - attr()->has_default_values(skip_mask_t::scales_runtime - | skip_mask_t::zero_points_runtime - | skip_mask_t::post_ops - | skip_mask_t::sum_dt, - dst_type), + VDISPATCH_CONV(attr()->has_default_values(skip_mask_t::scales + | skip_mask_t::zero_points + | skip_mask_t::post_ops + | skip_mask_t::sum_dt + | primitive_attr_t::skip_mask_t::input_zero_points + | primitive_attr_t::skip_mask_t::output_compensations, + dst_type), VERBOSE_UNSUPPORTED_ATTR); - VDISPATCH_CONV(attr()->post_ops_.check_sum_consistency(dst_type, - /* is_int8 */ true), + // VDISPATCH_CONV(attr()->post_ops_.check_sum_consistency(dst_type, + // /* is_int8 */ true), + // VERBOSE_UNSUPPORTED_POSTOP); + VDISPATCH_CONV(post_ops_ok(), VERBOSE_UNSUPPORTED_POSTOP); VDISPATCH_CONV(attr_scales_ok(), VERBOSE_UNSUPPORTED_SCALES_CFG); VDISPATCH_CONV(zero_points_valid(attr()), VERBOSE_UNSUPPORTED_ATTR); auto scratchpad = scratchpad_registry().registrar(); + // TODO: make `init_conf` assign initialized object to `jcp_` + jcp_ = conv_gemm_conf_t(); CHECK(jit_gemm_convolution_utils::init_conf(jcp_, scratchpad, *desc(), src_md_, weights_md_, dst_md_, bias_md_, attr_, dnnl_get_max_threads())); - VDISPATCH_CONV(gemm_x8s8s32x_convolution_utils::post_ops_ok( - attr()->post_ops_, &dst_md_), - VERBOSE_UNSUPPORTED_POSTOP); + // VDISPATCH_CONV(gemm_x8s8s32x_convolution_utils::post_ops_ok( + // attr()->post_ops_, &dst_md_), + // VERBOSE_UNSUPPORTED_POSTOP); return status::success; } - conv_gemm_conf_t jcp_; + conv_gemm_conf_t jcp_ = utils::zero(); + + protected: + bool post_ops_ok() const { + using namespace dnnl::impl::primitive_kind; + auto const &po = attr()->post_ops_; + + auto all_post_ops_supported = [&]() { + bool ok = true; + + for (int i = 0; i < po.len(); i++) { + ok = ok && utils::one_of(po.entry_[i].kind, sum, binary, eltwise, depthwise, quantization); + } + return ok; + }; + + return all_post_ops_supported(); + } }; gemm_x8s8s32x_convolution_fwd_t(const pd_t *apd) : primitive_t(apd) {} @@ -121,7 +141,8 @@ struct gemm_x8s8s32x_convolution_fwd_t : public primitive_t { const zero_point_call_params_t &zp, const memory_tracking::grantor_t &scratchpad, const void *post_ops_binary_rhs_arg_vec, - const exec_ctx_t &ctx) const; + const exec_ctx_t &ctx, + const uint8_t *input_zp_base, const int32_t *output_compensation_base) const; using pp_ker_t = gemm_x8s8s32x_convolution_utils::pp_ker_t; std::unique_ptr pp_ker_; @@ -129,9 +150,7 @@ struct gemm_x8s8s32x_convolution_fwd_t : public primitive_t { struct gemm_x8s8s32x_convolution_bwd_data_t : public primitive_t { struct pd_t : public cpu_convolution_bwd_data_pd_t { - pd_t(const convolution_desc_t *adesc, const primitive_attr_t *attr, - const convolution_fwd_pd_t *hint_fwd_pd) - : cpu_convolution_bwd_data_pd_t(adesc, attr, hint_fwd_pd), jcp_() {} + using cpu_convolution_bwd_data_pd_t::cpu_convolution_bwd_data_pd_t; DECLARE_COMMON_PD_T(diff_dst_md()->data_type == data_type::u8 ? 
IGEMM_S8U8S32_IMPL_STR @@ -158,13 +177,15 @@ struct gemm_x8s8s32x_convolution_bwd_data_t : public primitive_t { VERBOSE_BAD_ALGORITHM); VDISPATCH_CONV(!has_zero_dim_memory(), VERBOSE_EMPTY_TENSOR, ""); - VDISPATCH_CONV( - attr()->has_default_values( - primitive_attr_t::skip_mask_t::scales_runtime), + VDISPATCH_CONV(attr()->has_default_values( + primitive_attr_t::skip_mask_t::scales), VERBOSE_UNSUPPORTED_ATTR); VDISPATCH_CONV(attr_scales_ok(), VERBOSE_UNSUPPORTED_SCALES_CFG); auto scratchpad = scratchpad_registry().registrar(); + + // TODO: make `init_conf` assign initialized object to `jcp_` + jcp_ = conv_gemm_conf_t(); return jit_gemm_convolution_utils::init_conf(jcp_, scratchpad, *desc(), diff_src_md_, weights_md_, diff_dst_md_, bias_md_, attr_, dnnl_get_max_threads()); @@ -172,7 +193,7 @@ struct gemm_x8s8s32x_convolution_bwd_data_t : public primitive_t { bool support_bias() const override { return true; } - conv_gemm_conf_t jcp_; + conv_gemm_conf_t jcp_ = utils::zero(); }; gemm_x8s8s32x_convolution_bwd_data_t(const pd_t *apd) : primitive_t(apd) {} diff --git a/src/cpu/gemm_x8s8s32x_convolution_utils.cpp b/src/cpu/gemm_x8s8s32x_convolution_utils.cpp index 4d01a014b52..0df5540fe17 100644 --- a/src/cpu/gemm_x8s8s32x_convolution_utils.cpp +++ b/src/cpu/gemm_x8s8s32x_convolution_utils.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2020-2023 Intel Corporation +* Copyright 2020-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -39,11 +39,29 @@ namespace gemm_x8s8s32x_convolution_utils { template struct ref_pp_ker_t : pp_ker_t { ref_pp_ker_t(const convolution_pd_t *pd, const conv_gemm_conf_t &jcp) - : pp_ker_t(pd, jcp), dst_md_(pd->dst_md()) {} + : pp_ker_t(pd, jcp) { + for (int i = 0; i < post_ops_.len(); i++) { + auto &post_op = post_ops_.entry_[i]; + if (post_op.is_eltwise()) { + ref_eltwise_injectors_.push_back(new ref_eltwise_scalar_fwd_t(post_op.eltwise)); + } else if (post_op.is_depthwise()) { + ref_depthwise_injectors_.push_back(new ref_depthwise_scalar_fwd_t( + post_op.depthwise.alg)); + } + } + } + ~ref_pp_ker_t() { + for (auto impl : ref_eltwise_injectors_) + delete impl; + ref_eltwise_injectors_.clear(); + for (auto impl : ref_depthwise_injectors_) + delete impl; + ref_depthwise_injectors_.clear(); + } using acc_data_t = pp_ker_t::acc_data_t; - void operator()(void *dst, const acc_data_t *acc, const char *bias, + void operator()(void *dst, acc_data_t *acc, const char *bias, const float *scales, float dst_scale, float sum_scale, float signed_scale, int g, size_t start, size_t end, const zero_point_call_params_t &zp, @@ -51,88 +69,182 @@ struct ref_pp_ker_t : pp_ker_t { const exec_ctx_t &ctx, const memory_desc_t &dst_md, const single_gemm_conv_chunk_desc_t &chunk_desc) const override; - status_t create_kernel() override { - if (this->jcp_.with_eltwise || this->jcp_.with_binary) { - ref_post_ops_ - = utils::make_unique(this->jcp_.post_ops); - if (!ref_post_ops_) return status::out_of_memory; - return ref_post_ops_->init(dst_md_); - } - return status::success; - } - private: - std::unique_ptr ref_post_ops_; - const memory_desc_t *dst_md_; + nstl::vector ref_eltwise_injectors_; + nstl::vector ref_depthwise_injectors_; }; template -void ref_pp_ker_t::operator()(void *void_dst, const acc_data_t *acc, +void ref_pp_ker_t::operator()(void *void_dst, acc_data_t *acc, const char *bias, const float *scales, float 
dst_scale, float sum_scale, float signed_scale, int g, size_t start, size_t end, const zero_point_call_params_t &zp, - const void * /* post_ops_binary_rhs_arg_vec */, + const void * post_ops_binary_rhs_arg_vec, const void * /* dst_orig */, const exec_ctx_t &ctx, const memory_desc_t &dst_md, const single_gemm_conv_chunk_desc_t &chunk_desc) const { if (end <= start) return; - assert(data_traits::data_type == jcp_.dst_data_type); + assert(data_traits_t::data_type == dst_data_type_); + dst_data_t *dst = (dst_data_t *)void_dst; - const lldiv_t dv_start = std::div((long long)start, (long long)jcp_.oc); - const lldiv_t dv_end = std::div((long long)(end - 1), (long long)jcp_.oc); - const size_t first_oc = dv_start.rem; - const size_t last_oc = dv_end.rem; - const size_t first_os = dv_start.quot; - const size_t last_os = dv_end.quot; + const size_t first_oc = start % OC_; + const size_t last_oc = (end - 1) % OC_; + const size_t first_os = start / OC_; + const size_t last_os = (end - 1) / OC_; const int32_t zp_dst_val = jcp_.zp.dst_exists ? *(zp.dst) : 0; - ref_post_ops_t::args_t args; - args.ctx = &ctx; - args.dst_md = &dst_md; + if (post_ops_.len() == 0) { + for (size_t os = first_os; os <= last_os; os++) { + const size_t start_oc = (os == first_os) ? first_oc : 0; + const size_t end_oc = (os == last_os) ? last_oc : OC_ - 1; + for (size_t oc = start_oc; oc <= end_oc; oc++) { + const size_t acc_off = os * jcp_.oc + oc; + const size_t dst_off = os * dst_os_stride_ + oc; + + float d = (float) (acc[acc_off]); + if (jcp_.signed_input) d *= signed_scale; - for (size_t os = first_os; os <= last_os; os++) { - const size_t start_oc = (os == first_os) ? first_oc : 0; - const size_t end_oc = (os == last_os) ? last_oc : jcp_.oc - 1; - for (size_t oc = start_oc; oc <= end_oc; oc++) { - const size_t acc_off = os * jcp_.oc + oc; - const size_t dst_off = os * jcp_.dst_os_stride + oc; + if (do_bias_) + d += math::get_bias(bias, g * jcp_.oc + oc, bias_data_type_); - int32_t data_s32 = acc[acc_off]; + d *= scales[(g * jcp_.oc + oc) * jcp_.scale_idx_mult]; - if (jcp_.zp.src_exists) { - const auto oc_offset = g * jcp_.oc + oc; - data_s32 += zp.src_comp[oc_offset]; + // quantize data + if (jcp_.with_dst_scale) d *= dst_scale; + if (jcp_.zp.dst_exists) d += zp_dst_val; + + dst[dst_off] = dnnl::impl::cpu::q10n::qz_a1b0_t()(d); } + } + } else { + float* acc_fp = reinterpret_cast(acc); - float data = static_cast(data_s32); + auto load = [&](int idx, size_t oc, size_t os, size_t acc_off, size_t dst_off) { + float d; + if (idx == 0) { + d = (float) (acc[acc_off]); - if (jcp_.signed_input) data *= signed_scale; + if (jcp_.signed_input) + d *= signed_scale; - // dequantize data - data *= scales[(g * jcp_.oc + oc) * jcp_.scale_idx_mult]; + if (do_bias_) + d += math::get_bias(bias, g * jcp_.oc + oc, + bias_data_type_); - if (jcp_.with_bias) { - const float b = io::load_float_value( - jcp_.bias_data_type, bias, g * jcp_.oc + oc); - data += b; + d *= scales[(g * jcp_.oc + oc) * jcp_.scale_idx_mult]; + } else { + d = acc_fp[acc_off]; } - if (jcp_.with_sum) - data += sum_scale - * io::load_float_value( - jcp_.sum_data_type, void_dst, dst_off); - if (jcp_.with_eltwise || jcp_.with_binary) { - args.l_offset = (g * jcp_.oc + oc) * jcp_.os; - ref_post_ops_->execute(data, args); + return d; + }; + + auto store = [&](int idx, float d, size_t acc_off, size_t dst_off) { + if (idx == post_ops_.len() - 1) + dst[dst_off] = dnnl::impl::cpu::q10n::qz_a1b0_t()(d); + else + acc_fp[acc_off] = d; + }; + + auto post_ops_data_ptrs = 
reinterpret_cast(post_ops_binary_rhs_arg_vec); + std::size_t post_ops_data_idx = 0; + int eltwise_inj_idx = 0; + int depthwise_inj_idx = 0; + for (int i = 0; i < post_ops_.len(); i++) { + auto &post_op = post_ops_.entry_[i]; + if (post_op.is_eltwise()) { + for (size_t os = first_os; os <= last_os; os++) { + const size_t start_oc = (os == first_os) ? first_oc : 0; + const size_t end_oc = (os == last_os) ? last_oc : OC_ - 1; + for (size_t oc = start_oc; oc <= end_oc; oc++) { + const size_t acc_off = os * jcp_.oc + oc; + const size_t dst_off = os * this->dst_os_stride_ + oc; + + float d = load(i, oc, os, acc_off, dst_off); + + d = ref_eltwise_injectors_[eltwise_inj_idx]->compute_scalar(d); + + store(i, d, acc_off, dst_off); + } + } + eltwise_inj_idx++; + } else if (post_op.is_depthwise()) { + for (size_t os = first_os; os <= last_os; os++) { + const size_t start_oc = (os == first_os) ? first_oc : 0; + const size_t end_oc = (os == last_os) ? last_oc : OC_ - 1; + for (size_t oc = start_oc; oc <= end_oc; oc++) { + const size_t acc_off = os * jcp_.oc + oc; + const size_t dst_off = os * this->dst_os_stride_ + oc; + + auto depthwise_base = post_ops_data_ptrs[post_ops_data_idx]; + auto depthwise_weights = depthwise_base + post_op.depthwise.offset[post_op.depthwise.scales]; + auto depthwise_bias = depthwise_base + post_op.depthwise.offset[post_op.depthwise.shifts]; + + float d = load(i, oc, os, acc_off, dst_off); + + d = ref_depthwise_injectors_[depthwise_inj_idx]->compute_scalar(d, depthwise_weights + g * jcp_.oc + oc, + depthwise_bias + g * jcp_.oc + oc); + + store(i, d, acc_off, dst_off); + + } + } + post_ops_data_idx++; + depthwise_inj_idx++; + } else if (post_op.is_quantization()) { + for (size_t os = first_os; os <= last_os; os++) { + const size_t start_oc = (os == first_os) ? first_oc : 0; + const size_t end_oc = (os == last_os) ? last_oc : OC_ - 1; + for (size_t oc = start_oc; oc <= end_oc; oc++) { + const size_t acc_off = os * jcp_.oc + oc; + const size_t dst_off = os * this->dst_os_stride_ + oc; + + auto quant = post_op.quantization; + auto quantization_base = post_ops_data_ptrs[post_ops_data_idx]; + auto pcl = quantization_base + post_op.quantization.offset[quant.crop_low]; + auto pch = quantization_base + post_op.quantization.offset[quant.crop_high]; + auto pisc = quantization_base + post_op.quantization.offset[quant.inp_scale]; + auto pish = quantization_base + post_op.quantization.offset[quant.inp_shift]; + auto posc = quantization_base + post_op.quantization.offset[quant.output_scale]; + auto posh = quantization_base + post_op.quantization.offset[quant.output_shift]; + + float d = load(i, oc, os, acc_off, dst_off); + + int cl_idx = !quant.per_channel[quant.crop_low] ? 0 : g * jcp_.oc + oc; + int ch_idx = !quant.per_channel[quant.crop_high] ? 0 : g * jcp_.oc + oc; + int isc_idx = !quant.per_channel[quant.inp_scale] ? 0 : g * jcp_.oc + oc; + int ish_idx = !quant.per_channel[quant.inp_shift] ? 0 : g * jcp_.oc + oc; + int osc_idx = !quant.per_channel[quant.output_scale] ? 0 : g * jcp_.oc + oc; + int osh_idx = !quant.per_channel[quant.output_shift] ? 0 : g * jcp_.oc + oc; + + d = nstl::min(pch[ch_idx], nstl::max(pcl[cl_idx], d)); + d = d * pisc[isc_idx] + pish[ish_idx]; + d = roundf(d); + d = d * posc[osc_idx] + posh[osh_idx]; + + store(i, d, acc_off, dst_off); + + } + } + post_ops_data_idx++; + } else if (post_op.is_sum()) { + for (size_t os = first_os; os <= last_os; os++) { + const size_t start_oc = (os == first_os) ? first_oc : 0; + const size_t end_oc = (os == last_os) ? 
last_oc : OC_ - 1; + for (size_t oc = start_oc; oc <= end_oc; oc++) { + const size_t acc_off = os * jcp_.oc + oc; + const size_t dst_off = os * this->dst_os_stride_ + oc; + + float d = load(i, oc, os, acc_off, dst_off); + + d += post_op.sum.scale * math::get_sum((char *) dst, dst_off, post_op.sum.dt); + + store(i, d, acc_off, dst_off); + } + } } - - // quantize data - if (jcp_.with_dst_scale) data *= dst_scale; - if (jcp_.zp.dst_exists) data += static_cast(zp_dst_val); - - io::store_float_value(jcp_.dst_data_type, data, void_dst, dst_off); } } } @@ -140,7 +252,23 @@ void ref_pp_ker_t::operator()(void *void_dst, const acc_data_t *acc, // Interface section pp_ker_t::pp_ker_t(const convolution_pd_t *pd, const conv_gemm_conf_t &jcp) - : jcp_(jcp) {} + : jcp_(jcp) + , post_ops_(pd->attr()->post_ops_) + , OC_(jcp_.oc) +{ + const auto dst_md = memory_desc_wrapper(pd->dst_md()); + + dst_os_stride_ = dst_md.blocking_desc().strides[pd->ndims() - 1]; + dst_data_type_ = dst_md.data_type(); + // Use weight scale to do DQ. + do_scale_ = !pd->attr()->scales_.get(DNNL_ARG_WEIGHTS).has_default_values(); + + do_bias_ = pd->with_bias(); + if (do_bias_) { + bias_data_type_ = pd->desc()->bias_desc.data_type; + assert(bias_data_type_ != data_type::undef); + } +} pp_ker_t *pp_ker_t::create( const convolution_pd_t *pd, const conv_gemm_conf_t &jcp) { @@ -160,21 +288,6 @@ pp_ker_t *pp_ker_t::create( return nullptr; } -bool post_ops_ok(const post_ops_t &post_ops, const memory_desc_wrapper *dst_d) { -#if DNNL_X64 - return x64::gemm_x8s8s32x_convolution_utils::post_ops_ok(post_ops, dst_d); -#endif - return std::all_of(post_ops.entry_.cbegin(), post_ops.entry_.cend(), - [](const dnnl_post_ops::entry_t &post_op) { - return post_op.is_eltwise() || post_op.is_sum() - || post_op.is_binary() || post_op.is_prelu(); - }); -} - -bool post_ops_ok(const post_ops_t &post_ops, const memory_desc_t *dst_d) { - const auto dst_md = memory_desc_wrapper(dst_d); - return post_ops_ok(post_ops, &dst_md); -} bool mayiuse_jit_pp_kernel(data_type_t dst_dt) noexcept { #if DNNL_X64 diff --git a/src/cpu/gemm_x8s8s32x_convolution_utils.hpp b/src/cpu/gemm_x8s8s32x_convolution_utils.hpp index e133222f963..86e949ea995 100644 --- a/src/cpu/gemm_x8s8s32x_convolution_utils.hpp +++ b/src/cpu/gemm_x8s8s32x_convolution_utils.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2020-2022 Intel Corporation +* Copyright 2020-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
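The `load`/`store` lambdas in `ref_pp_ker_t::operator()` above chain the post-ops through a single buffer: the first op reads the raw int32 accumulator and dequantizes it (signed-input scale, bias, weight scale), intermediate ops round-trip a float through the same storage (`acc_fp`), and only the last op quantizes into `dst`. A condensed sketch of that chaining (hypothetical op list; reusing the buffer in place relies on sizeof(int32_t) == sizeof(float), exactly as the patch does):

#include <cstddef>
#include <cstdint>
#include <functional>
#include <vector>

using post_op_fn = std::function<float(float)>;

void apply_chain(int32_t *acc, uint8_t *dst, size_t len, float scale,
        const std::vector<post_op_fn> &ops) {
    float *acc_fp = reinterpret_cast<float *>(acc); // same storage, reused
    for (size_t i = 0; i < ops.size(); ++i)
        for (size_t j = 0; j < len; ++j) {
            // Only the first op sees the int32 view; later ops see float.
            float d = (i == 0) ? static_cast<float>(acc[j]) * scale : acc_fp[j];
            d = ops[i](d);
            if (i + 1 == ops.size()) // last op: saturate and store to dst
                dst[j] = static_cast<uint8_t>(
                        d < 0.f ? 0.f : d > 255.f ? 255.f : d + 0.5f);
            else
                acc_fp[j] = d;
        }
}

The empty-chain case is handled by a separate branch in the patch, which is why a sketch like this may assume `ops` is non-empty.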
@@ -32,9 +32,9 @@ struct pp_ker_t { const convolution_pd_t *pd, const conv_gemm_conf_t &jcp); virtual ~pp_ker_t() = default; - typedef typename prec_traits::type acc_data_t; + using acc_data_t = typename prec_traits_t::type; - virtual void operator()(void *dst, const acc_data_t *acc, const char *bias, + virtual void operator()(void *dst, acc_data_t *acc, const char *bias, const float *scales, float dst_scale, float sum_scale, float signed_scale, int g, size_t start, size_t end, const zero_point_call_params_t &zp, @@ -42,17 +42,25 @@ struct pp_ker_t { const exec_ctx_t &ctx, const memory_desc_t &dst_md, const single_gemm_conv_chunk_desc_t &chunk_desc) const = 0; + size_t dst_os_stride_; + virtual status_t create_kernel() { return status::success; } protected: pp_ker_t(const convolution_pd_t *pd, const conv_gemm_conf_t &jcp); const conv_gemm_conf_t &jcp_; -}; + const post_ops_t &post_ops_; + size_t OC_; + + bool mayiuse_jit_pp_kernel(data_type_t dst_dt) noexcept; -bool post_ops_ok(const post_ops_t &post_ops, const memory_desc_wrapper *dst_d); -bool post_ops_ok(const post_ops_t &post_ops, const memory_desc_t *dst_d); -bool mayiuse_jit_pp_kernel(data_type_t dst_dt) noexcept; + bool do_bias_ = false; + bool do_scale_ = false; + + data_type_t bias_data_type_ = data_type::undef; + data_type_t dst_data_type_ = data_type::undef; +}; } // namespace gemm_x8s8s32x_convolution_utils } // namespace cpu diff --git a/src/cpu/gemm_x8s8s32x_inner_product.cpp b/src/cpu/gemm_x8s8s32x_inner_product.cpp index 341a584a276..cad125ea7be 100644 --- a/src/cpu/gemm_x8s8s32x_inner_product.cpp +++ b/src/cpu/gemm_x8s8s32x_inner_product.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2018-2024 Intel Corporation +* Copyright 2018-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -64,10 +64,9 @@ status_t gemm_x8s8s32x_inner_product_fwd_t::execute_forward( DEFINE_ARG_SCALES_BUFFER(dst_scales, DNNL_ARG_DST); auto scratchpad = ctx.get_scratchpad_grantor(); - const int wei_scale_mask - = pd()->attr()->scales_.get(DNNL_ARG_WEIGHTS).mask_; + const int wei_scale_mask = pd()->attr()->scales_.get_mask(DNNL_ARG_WEIGHTS); const float *scales = precompute_scales(scratchpad, src_scales, wei_scales, - IC, OC, false, wei_scale_mask != 0, pd()->attr()); + IC, OC, false, wei_scale_mask > 0, pd()->attr()); int32_t *acc = pd()->dst_is_acc_ ? (int32_t *)dst diff --git a/src/cpu/gemm_x8s8s32x_inner_product.hpp b/src/cpu/gemm_x8s8s32x_inner_product.hpp index ea62c604e05..bda7860417b 100644 --- a/src/cpu/gemm_x8s8s32x_inner_product.hpp +++ b/src/cpu/gemm_x8s8s32x_inner_product.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2018-2024 Intel Corporation +* Copyright 2018-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
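In the attribute changes above, the migration from `mask_ != 0` to `get_mask(...) > 0` keeps the usual oneDNN convention: a zero scale mask means one scale for the whole tensor, while a set bit marks a dimension that carries its own scales. A minimal sketch of how a post-processing loop consumes that convention (the helper name is illustrative, not library code):

#include <cstddef>

// Illustrative: with mask == 0 every output channel reads scales[0];
// with any bit set the kernel indexes one scale per output channel.
inline float weight_scale(const float *scales, int wei_scale_mask,
        std::size_t oc) {
    const bool per_oc = wei_scale_mask > 0;
    return scales[per_oc ? oc : 0];
}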
@@ -69,7 +69,7 @@ struct gemm_x8s8s32x_inner_product_fwd_t : public primitive_t { VERBOSE_UNSUPPORTED_DT); VDISPATCH_INNER_PRODUCT( attr()->has_default_values( - primitive_attr_t::skip_mask_t::scales_runtime + primitive_attr_t::skip_mask_t::scales | primitive_attr_t::skip_mask_t::post_ops, dst_md()->data_type), VERBOSE_UNSUPPORTED_ATTR); diff --git a/src/cpu/jit_utils/jit_utils.cpp b/src/cpu/jit_utils/jit_utils.cpp index 431cd71a4a9..d95484401c0 100644 --- a/src/cpu/jit_utils/jit_utils.cpp +++ b/src/cpu/jit_utils/jit_utils.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2023 Intel Corporation +* Copyright 2019-2025 Intel Corporation * Copyright 2021 FUJITSU LIMITED * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -16,6 +16,7 @@ *******************************************************************************/ #include +#include #include "common/utils.hpp" #include "common/verbose.hpp" @@ -31,7 +32,7 @@ #endif #if DNNL_ENABLE_JIT_PROFILING -#include "common/ittnotify/jitprofiling.h" +#include "ittnotify/jitprofiling.h" #ifdef __linux__ #include "cpu/jit_utils/linux_perf/linux_perf.hpp" #endif @@ -60,7 +61,7 @@ void dump_jit_code(const void *code, size_t code_size, const char *code_name) { // TODO (Roma): support prefix for code / linux perf dumps snprintf(fname, MAX_FNAME_LEN, DUMP_BASE_FNAME "%s" DUMP_EXT_FNAME, code_name); - + std::cout << "[ oneDNN ] dump_jit_code: " << fname << std::endl; FILE *fp = fopen(fname, "wb+"); // Failure to dump code is not fatal if (fp) { @@ -97,7 +98,7 @@ void register_jit_code_vtune(const void *code, size_t code_size, } #else if (flags & DNNL_JIT_PROFILE_VTUNE) - VERROR(primitive, jit_profiling, + VWARN(primitive, jit_profiling, "VTune Profiler integration is not supported"); #endif #else @@ -137,7 +138,9 @@ void register_jit_code(const void *code, size_t code_size, char unique_code_name[MAX_CODENAME_LEN + 1]; snprintf(unique_code_name, MAX_CODENAME_LEN, "%s.%d", code_name, unique_id++); - + if (code && get_jit_dump()) { + std::cout << "[ oneDNN ] register_jit_code: " << unique_code_name << ", " << code_name << std::endl; + } dump_jit_code(code, code_size, unique_code_name); // VTune Profiler does not need a unique name, because it uses // unique method_id diff --git a/src/cpu/jit_utils/linux_perf/linux_perf.cpp b/src/cpu/jit_utils/linux_perf/linux_perf.cpp index 2a815d77505..0dc561518d9 100644 --- a/src/cpu/jit_utils/linux_perf/linux_perf.cpp +++ b/src/cpu/jit_utils/linux_perf/linux_perf.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2023 Intel Corporation +* Copyright 2019-2024 Intel Corporation * Copyright 2021 FUJITSU LIMITED * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -195,7 +195,7 @@ class linux_perf_jitdump_t { } #else if (use_tsc) { - VERROR(primitive, linux_perf, + VWARN(primitive, linux_perf, "TSC timestamps is not supported. clock_gettime() is used " "instead."); } diff --git a/src/cpu/matmul/cpu_matmul_list.cpp b/src/cpu/matmul/cpu_matmul_list.cpp index 6a53d0920c6..6868128bcf0 100644 --- a/src/cpu/matmul/cpu_matmul_list.cpp +++ b/src/cpu/matmul/cpu_matmul_list.cpp @@ -1,7 +1,7 @@ /******************************************************************************* * Copyright 2019-2024 Intel Corporation -* Copyright 2024 FUJITSU LIMITED -* Copyright 2021-2024 Arm Ltd. and affiliates +* Copyright 2024-2025 FUJITSU LIMITED +* Copyright 2021-2025 Arm Ltd. 
and affiliates * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -30,15 +30,23 @@ #include "cpu/x64/matmul/jit_uni_sparse_matmul.hpp" using namespace dnnl::impl::cpu::x64::matmul; using namespace dnnl::impl::cpu::x64; -#elif DNNL_AARCH64 +#endif + +#if DNNL_AARCH64 #include "cpu/aarch64/matmul/brgemm_matmul.hpp" -#ifdef DNNL_AARCH64_USE_ACL -#include "cpu/aarch64/matmul/acl_lowp_matmul.hpp" -#include "cpu/aarch64/matmul/acl_matmul.hpp" +#include "cpu/aarch64/matmul/jit_int8_matmul.hpp" #endif + +#ifdef DNNL_USE_ACL +#include "cpu/acl/matmul/acl_lowp_matmul.hpp" +#include "cpu/acl/matmul/acl_lowp_matmul_sq.hpp" +#include "cpu/acl/matmul/acl_matmul.hpp" +#if DNNL_AARCH64 using namespace dnnl::impl::cpu::aarch64::matmul; using namespace dnnl::impl::cpu::aarch64; - +#endif +using namespace dnnl::impl::cpu::acl::matmul; +using namespace dnnl::impl::cpu::acl; #endif namespace dnnl { @@ -71,25 +79,27 @@ using namespace dnnl::impl::cpu::matmul; #endif // clang-format off -constexpr impl_list_item_t impl_list[] = REG_MATMUL_P({ - - CPU_INSTANCE_AARCH64(brgemm_matmul_t) - CPU_INSTANCE_AARCH64_ACL(acl_lowp_matmul_t) - CPU_INSTANCE_AARCH64_ACL(acl_matmul_t) - CPU_INSTANCE_AARCH64(brgemm_matmul_t) - CPU_INSTANCE_AMX(brgemm_matmul_t) - CPU_INSTANCE_AMX(brgemm_matmul_t) - CPU_INSTANCE_AVX512(brgemm_matmul_t) - CPU_INSTANCE_AVX512(brgemm_matmul_t) - CPU_INSTANCE_AVX512(brgemm_matmul_t) - CPU_INSTANCE_AVX512(brgemm_matmul_t) - CPU_INSTANCE_AVX2(brgemm_matmul_t) - CPU_INSTANCE_AVX2(brgemm_matmul_t) +const impl_list_item_t impl_list[] = REG_MATMUL_P({ + + CPU_INSTANCE_AARCH64(brgemm_matmul_t) + CPU_INSTANCE_ACL(acl_lowp_matmul_sq_t) + CPU_INSTANCE_ACL(acl_lowp_matmul_t) + CPU_INSTANCE_ACL(acl_matmul_t) + CPU_INSTANCE_AARCH64(brgemm_matmul_t,sve_256) + CPU_INSTANCE_AARCH64(jit_int8_matmul_t) + CPU_INSTANCE_AMX(brgemm_matmul_t,avx512_core_amx_fp16) + CPU_INSTANCE_AMX(brgemm_matmul_t,avx512_core_amx) + CPU_INSTANCE_AVX512(brgemm_matmul_t,avx512_core_fp16) + CPU_INSTANCE_AVX512(brgemm_matmul_t,avx512_core_bf16) + CPU_INSTANCE_AVX512(brgemm_matmul_t,avx512_core_vnni) + CPU_INSTANCE_AVX512(brgemm_matmul_t,avx512_core) + CPU_INSTANCE_AVX2(brgemm_matmul_t,avx2_vnni_2) + CPU_INSTANCE_AVX2(brgemm_matmul_t,avx2_vnni) CPU_INSTANCE(gemm_f32_matmul_t) - CPU_INSTANCE(gemm_bf16_matmul_t) - CPU_INSTANCE(gemm_bf16_matmul_t) + CPU_INSTANCE(gemm_bf16_matmul_t, f32) + CPU_INSTANCE(gemm_bf16_matmul_t, bf16) CPU_INSTANCE(gemm_x8s8s32x_matmul_t) - CPU_INSTANCE_AVX2(brgemm_matmul_t) + CPU_INSTANCE_AVX2(brgemm_matmul_t, avx2) CPU_INSTANCE(ref_matmul_t) CPU_INSTANCE(ref_matmul_int8_t) // These implementations are enabled only when DNNL_EXPERIMENTAL_SPARSE @@ -112,4 +122,4 @@ const impl_list_item_t *get_matmul_impl_list(const matmul_desc_t *desc) { } // namespace cpu } // namespace impl -} // namespace dnnl +} // namespace dnnl \ No newline at end of file diff --git a/src/cpu/matmul/gemm_bf16_matmul.cpp b/src/cpu/matmul/gemm_bf16_matmul.cpp index cd415b94743..c61b54be5cf 100644 --- a/src/cpu/matmul/gemm_bf16_matmul.cpp +++ b/src/cpu/matmul/gemm_bf16_matmul.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2024 Intel Corporation +* Copyright 2019-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
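The reshuffled `impl_list[]` above matters because implementations are tried in declaration order: primitive descriptor creation conceptually walks the list and the first entry that initializes successfully is used, which is why AMX and AVX-512 specializations sit before generic GEMM and reference entries. A simplified model of that selection, with stand-in types rather than the library's internals:

#include <functional>
#include <vector>

// Stand-in model: each entry attempts to create itself for the problem;
// the first one that succeeds wins, so list order encodes preference.
enum class status_model { success, unimplemented };
using try_create_fn = std::function<status_model()>;

inline int select_impl(const std::vector<try_create_fn> &impl_list) {
    for (int i = 0; i < (int)impl_list.size(); ++i)
        if (impl_list[i]() == status_model::success) return i;
    return -1; // no implementation accepted the problem
}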
@@ -66,9 +66,9 @@ status_t gemm_bf16_matmul_t::pd_t::init(engine_t *engine) { VDISPATCH_MATMUL(x64::mayiuse(x64::avx512_core), VERBOSE_UNSUPPORTED_ISA); #endif - VDISPATCH_MATMUL(attr()->has_default_values( - primitive_attr_t::skip_mask_t::scales_runtime - | primitive_attr_t::skip_mask_t::post_ops), + VDISPATCH_MATMUL( + attr()->has_default_values(primitive_attr_t::skip_mask_t::scales + | primitive_attr_t::skip_mask_t::post_ops), VERBOSE_UNSUPPORTED_ATTR); VDISPATCH_MATMUL(attr()->post_ops_.check_sum_consistency(dst_type, /* is_int8 */ false), @@ -105,9 +105,9 @@ status_t gemm_bf16_matmul_t::pd_t::check_and_configure_attributes( engine_t *engine) { auto check_attr_scales = [&]() -> bool { bool ok = attr_scales_ok(); - if (!attr()->scales_.get(DNNL_ARG_SRC).has_default_values() - && !attr()->scales_.get(DNNL_ARG_WEIGHTS).has_default_values() - && attr()->scales_.get(DNNL_ARG_WEIGHTS).mask_ != 0) { + if (!attr()->scales_.has_default_values(DNNL_ARG_SRC) + && !attr()->scales_.has_default_values(DNNL_ARG_WEIGHTS) + && attr()->scales_.get_mask(DNNL_ARG_WEIGHTS) > 0) { // This case requires scratchpad with unknown size if (N() == DNNL_RUNTIME_DIM_VAL) ok = false; } @@ -145,11 +145,15 @@ status_t gemm_bf16_matmul_t::pd_t::check_and_configure_attributes( // set state CHECK(params_.pp_attr_.copy_from(*attr())); params_.gemm_applies_output_scales_ - = attr()->scales_.get(DNNL_ARG_WEIGHTS).mask_ == 0 && !with_bias(); + = attr()->scales_.get_mask(DNNL_ARG_WEIGHTS) == 0 && !with_bias(); if (params_.gemm_applies_output_scales_) { - params_.pp_attr_.scales_.reset(DNNL_ARG_SRC); - params_.pp_attr_.scales_.reset(DNNL_ARG_WEIGHTS); + VDISPATCH_MATMUL_SC(params_.pp_attr_.scales_.set( + DNNL_ARG_SRC, default_quant_entry()), + VERBOSE_UNSUPPORTED_SCALES_CFG); + VDISPATCH_MATMUL_SC(params_.pp_attr_.scales_.set( + DNNL_ARG_WEIGHTS, default_quant_entry()), + VERBOSE_UNSUPPORTED_SCALES_CFG); } // check post-ops @@ -203,11 +207,10 @@ status_t gemm_bf16_matmul_t::execute_ref( DEFINE_ARG_SCALES_BUFFER(dst_scales, DNNL_ARG_DST); auto scratchpad = ctx.get_scratchpad_grantor(); - const int wei_scale_mask - = pd()->attr()->scales_.get(DNNL_ARG_WEIGHTS).mask_; + const int wei_scale_mask = pd()->attr()->scales_.get_mask(DNNL_ARG_WEIGHTS); const float *scales = precompute_scales(scratchpad, src_scales, wei_scales, src_d.dims()[ndims - 1], dst_d.dims()[ndims - 1], false, - wei_scale_mask != 0, pd()->attr()); + wei_scale_mask > 0, pd()->attr()); if (src_d.has_zero_dim() || weights_d.has_zero_dim() || dst_d.has_zero_dim()) @@ -254,7 +257,7 @@ status_t gemm_bf16_matmul_t::execute_ref( const float beta = params.gemm_beta_; const dim_t acc_ldc = dst_is_acc ? 
ldc : N; const int scale_idx_mult - = this->pd()->attr()->scales_.get(DNNL_ARG_WEIGHTS).mask_ + = this->pd()->attr()->scales_.get_mask(DNNL_ARG_WEIGHTS) == (1 << (ndims - 1)); std::atomic st(status::success); @@ -271,11 +274,7 @@ status_t gemm_bf16_matmul_t::execute_ref( const dim_t acc_stride = gemm_based::get_scratchpad_block_elements( batch, M, N, use_single_gemm_call, nthr); -#ifdef GCC_WA_LAMBDA_C_CAST - parallel(nthr, [= WA_THIS_COPY_CAPTURE, &st](int ithr, int nthr) { -#else parallel(nthr, [&](int ithr, int nthr) { -#endif size_t t_work_start {0}, t_work_end {0}; balance211(work_amount, nthr, ithr, t_work_start, t_work_end); diff --git a/src/cpu/matmul/gemm_bf16_matmul.hpp b/src/cpu/matmul/gemm_bf16_matmul.hpp index 0df8bdd2317..0556db01a3e 100644 --- a/src/cpu/matmul/gemm_bf16_matmul.hpp +++ b/src/cpu/matmul/gemm_bf16_matmul.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2023 Intel Corporation +* Copyright 2019-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -90,10 +90,10 @@ struct gemm_bf16_matmul_t : public primitive_t { static constexpr data_type_t weights_type = data_type::bf16; static constexpr data_type_t acc_type = data_type::f32; - typedef typename prec_traits::type src_data_t; - typedef typename prec_traits::type weights_data_t; - typedef typename prec_traits::type dst_data_t; - typedef typename prec_traits::type acc_data_t; + using src_data_t = typename prec_traits_t::type; + using weights_data_t = typename prec_traits_t::type; + using dst_data_t = typename prec_traits_t::type; + using acc_data_t = typename prec_traits_t::type; status_t execute(const exec_ctx_t &ctx) const override { return execute_ref(ctx); diff --git a/src/cpu/matmul/gemm_f32_matmul.cpp b/src/cpu/matmul/gemm_f32_matmul.cpp index de57af38944..76c9d1b7be9 100644 --- a/src/cpu/matmul/gemm_f32_matmul.cpp +++ b/src/cpu/matmul/gemm_f32_matmul.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2024 Intel Corporation +* Copyright 2019-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
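`scale_idx_mult` above is a small branch-free indexing trick: it is 1 when the weight-scale mask equals 1 << (ndims - 1) (one scale per N) and 0 when a single scale is broadcast, so the epilogue can always write scales[n * scale_idx_mult]. Restated as a standalone sketch (illustrative names):

#include <cstdint>

// Illustrative: one expression covers both per-tensor and per-N scales;
// scale_idx_mult == 0 makes every column read scales[0].
inline void apply_output_scales(float *row, const float *scales,
        int scale_idx_mult, std::int64_t N) {
    for (std::int64_t n = 0; n < N; ++n)
        row[n] *= scales[n * scale_idx_mult];
}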
@@ -50,9 +50,9 @@ status_t gemm_f32_matmul_t::pd_t::init(engine_t *engine) { auto check_attr_scales = [&]() -> bool { bool ok = attr_scales_ok(); - if (!attr()->scales_.get(DNNL_ARG_SRC).has_default_values() - && !attr()->scales_.get(DNNL_ARG_WEIGHTS).has_default_values() - && attr()->scales_.get(DNNL_ARG_WEIGHTS).mask_ != 0) { + if (!attr()->scales_.has_default_values(DNNL_ARG_SRC) + && !attr()->scales_.has_default_values(DNNL_ARG_WEIGHTS) + && attr()->scales_.get_mask(DNNL_ARG_WEIGHTS) > 0) { // This case requires scratchpad with unknown size if (N() == DNNL_RUNTIME_DIM_VAL) ok = false; } @@ -92,11 +92,11 @@ status_t gemm_f32_matmul_t::pd_t::init(engine_t *engine) { VDISPATCH_MATMUL(is_dense_format_kind(), VERBOSE_UNSUPPORTED_SPARSE_CFG); VDISPATCH_MATMUL(problem_dt_correct, VERBOSE_UNSUPPORTED_DT_CFG); VDISPATCH_MATMUL(!has_zero_dim_memory(), VERBOSE_EMPTY_TENSOR, ""); - VDISPATCH_MATMUL(attr()->has_default_values( - primitive_attr_t::skip_mask_t::scales_runtime - | primitive_attr_t::skip_mask_t::post_ops - | primitive_attr_t::skip_mask_t::sum_dt, - dst_type), + VDISPATCH_MATMUL( + attr()->has_default_values(primitive_attr_t::skip_mask_t::scales + | primitive_attr_t::skip_mask_t::post_ops + | primitive_attr_t::skip_mask_t::sum_dt, + dst_type), VERBOSE_UNSUPPORTED_ATTR); VDISPATCH_MATMUL(attr()->post_ops_.check_sum_consistency(dst_type, /* is_int8 */ false), @@ -131,10 +131,14 @@ status_t gemm_f32_matmul_t::pd_t::configure_attributes() { CHECK(params_.pp_attr_.copy_from(*attr())); params_.gemm_applies_output_scales_ - = attr()->scales_.get(DNNL_ARG_WEIGHTS).mask_ == 0 && !with_bias(); + = attr()->scales_.get_mask(DNNL_ARG_WEIGHTS) == 0 && !with_bias(); if (params_.gemm_applies_output_scales_) { - params_.pp_attr_.scales_.reset(DNNL_ARG_SRC); - params_.pp_attr_.scales_.reset(DNNL_ARG_WEIGHTS); + VDISPATCH_MATMUL_SC(params_.pp_attr_.scales_.set( + DNNL_ARG_SRC, default_quant_entry()), + VERBOSE_UNSUPPORTED_SCALES_CFG); + VDISPATCH_MATMUL_SC(params_.pp_attr_.scales_.set( + DNNL_ARG_WEIGHTS, default_quant_entry()), + VERBOSE_UNSUPPORTED_SCALES_CFG); } const auto &po = params_.pp_attr_.post_ops_; @@ -186,11 +190,10 @@ status_t gemm_f32_matmul_t::execute_ref(const exec_ctx_t &ctx) const { DEFINE_ARG_SCALES_BUFFER(dst_scales, DNNL_ARG_DST); auto scratchpad = ctx.get_scratchpad_grantor(); - const int wei_scale_mask - = pd()->attr()->scales_.get(DNNL_ARG_WEIGHTS).mask_; + const int wei_scale_mask = pd()->attr()->scales_.get_mask(DNNL_ARG_WEIGHTS); const float *scales = precompute_scales(scratchpad, src_scales, wei_scales, src_d.dims()[ndims - 1], dst_d.dims()[ndims - 1], false, - wei_scale_mask != 0, pd()->attr()); + wei_scale_mask > 0, pd()->attr()); if (src_d.has_zero_dim() || weights_d.has_zero_dim() || dst_d.has_zero_dim()) @@ -237,7 +240,7 @@ status_t gemm_f32_matmul_t::execute_ref(const exec_ctx_t &ctx) const { const dim_t acc_ldc = dst_is_acc ? 
ldc : N; const int scale_idx_mult - = this->pd()->attr()->scales_.get(DNNL_ARG_WEIGHTS).mask_ + = this->pd()->attr()->scales_.get_mask(DNNL_ARG_WEIGHTS) == (1 << (ndims - 1)); std::atomic st(status::success); diff --git a/src/cpu/matmul/gemm_f32_matmul.hpp b/src/cpu/matmul/gemm_f32_matmul.hpp index 447de227565..dac4206c198 100644 --- a/src/cpu/matmul/gemm_f32_matmul.hpp +++ b/src/cpu/matmul/gemm_f32_matmul.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2023 Intel Corporation +* Copyright 2019-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -88,10 +88,10 @@ struct gemm_f32_matmul_t : public primitive_t { static constexpr data_type_t dst_type = data_type::f32; static constexpr data_type_t acc_type = data_type::f32; - typedef typename prec_traits::type src_data_t; - typedef typename prec_traits::type weights_data_t; - typedef typename prec_traits::type dst_data_t; - typedef typename prec_traits::type acc_data_t; + using src_data_t = typename prec_traits_t::type; + using weights_data_t = typename prec_traits_t::type; + using dst_data_t = typename prec_traits_t::type; + using acc_data_t = typename prec_traits_t::type; status_t execute(const exec_ctx_t &ctx) const override { return execute_ref(ctx); diff --git a/src/cpu/matmul/gemm_x8s8s32x_matmul.cpp b/src/cpu/matmul/gemm_x8s8s32x_matmul.cpp index 5fab321d7af..a9a7e209928 100644 --- a/src/cpu/matmul/gemm_x8s8s32x_matmul.cpp +++ b/src/cpu/matmul/gemm_x8s8s32x_matmul.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2024 Intel Corporation +* Copyright 2019-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
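`gemm_applies_output_scales_` above captures a fast path shared by these GEMM-based implementations: when the weight-scale mask is 0 (a single scale for the whole tensor) and there is no bias, the scale can be folded into the GEMM alpha, and the scales in the copied attributes are reset to defaults so post-processing does not apply them a second time. A hedged sketch of the decision only (not the library's actual plumbing):

// Illustrative: fold a per-tensor scale into C = alpha * A * B instead of
// rescaling every C element in a separate post-processing pass.
inline float pick_gemm_alpha(float combined_scale, int wei_scale_mask,
        bool with_bias) {
    const bool gemm_applies_scales = wei_scale_mask == 0 && !with_bias;
    return gemm_applies_scales ? combined_scale : 1.0f;
}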
@@ -61,17 +61,27 @@ status_t gemm_x8s8s32x_matmul_t::pd_t::init(engine_t *engine) { auto check_attr_scales = [&]() -> bool { bool ok = attr_scales_ok(); - if (!attr()->scales_.get(DNNL_ARG_SRC).has_default_values() - && !attr()->scales_.get(DNNL_ARG_WEIGHTS).has_default_values() - && attr()->scales_.get(DNNL_ARG_WEIGHTS).mask_ != 0) { + if (!attr()->scales_.has_default_values(DNNL_ARG_SRC) + && !attr()->scales_.has_default_values(DNNL_ARG_WEIGHTS) + && attr()->scales_.get_mask(DNNL_ARG_WEIGHTS) > 0) { // This case requires scratchpad with unknown size if (N() == DNNL_RUNTIME_DIM_VAL) ok = false; } return ok; }; - auto check_attr_zero_points - = [&]() -> bool { return attr()->zero_points_.common(); }; + auto check_attr_zero_points = [&]() -> bool { + const auto &zp = attr()->zero_points_; + static const std::vector supported_args { + DNNL_ARG_SRC, DNNL_ARG_WEIGHTS, DNNL_ARG_DST}; + for (int arg : supported_args) { + if (!zp.has_default_values(arg)) { + const int mask = zp.get_mask(arg); + if (mask > 0) return false; + } + } + return true; + }; auto check_attr_post_ops = [&]() -> bool { using namespace primitive_kind; @@ -117,9 +127,8 @@ status_t gemm_x8s8s32x_matmul_t::pd_t::init(engine_t *engine) { VDISPATCH_MATMUL(check_attr_scales(), VERBOSE_UNSUPPORTED_SCALES_CFG); VDISPATCH_MATMUL(check_attr_zero_points(), VERBOSE_UNSUPPORTED_ATTR); VDISPATCH_MATMUL( - attr()->has_default_values( - primitive_attr_t::skip_mask_t::scales_runtime - | primitive_attr_t::skip_mask_t::zero_points_runtime + attr()->has_default_values(primitive_attr_t::skip_mask_t::scales + | primitive_attr_t::skip_mask_t::zero_points | primitive_attr_t::skip_mask_t::post_ops | primitive_attr_t::skip_mask_t::sum_dt, dst_md()->data_type), @@ -203,11 +212,10 @@ status_t gemm_x8s8s32x_matmul_t::execute_ref(const exec_ctx_t &ctx) const { DEFINE_ARG_SCALES_BUFFER(dst_scales, DNNL_ARG_DST); auto &scratchpad = ctx.get_scratchpad_grantor(); - const int wei_scale_mask - = pd()->attr()->scales_.get(DNNL_ARG_WEIGHTS).mask_; + const int wei_scale_mask = pd()->attr()->scales_.get_mask(DNNL_ARG_WEIGHTS); const float *scales = precompute_scales(scratchpad, src_scales, wei_scales, src_d.dims()[ndims - 1], dst_d.dims()[ndims - 1], false, - wei_scale_mask != 0, pd()->attr()); + wei_scale_mask > 0, pd()->attr()); DEFINE_ZERO_POINT_VALUE(src_zero_point, DNNL_ARG_SRC); DEFINE_ZERO_POINT_VALUE(weights_zero_point, DNNL_ARG_WEIGHTS); @@ -245,7 +253,9 @@ status_t gemm_x8s8s32x_matmul_t::execute_ref(const exec_ctx_t &ctx) const { const char transB = helper.transB(); const dim_t lda = helper.lda(); const dim_t ldb = helper.ldb(); - const dim_t ldc = helper.ldc(); + const dim_t ldc = dst_d.ndims() == 2 && dst_d.count_non_unit_dims(1) + ? N + : helper.ldc(); const int ldx_dim_idx = pd()->ndims() - 2; const dim_t *src_strides = &src_d.blocking_desc().strides[ldx_dim_idx]; const dim_t *weights_strides @@ -276,7 +286,7 @@ status_t gemm_x8s8s32x_matmul_t::execute_ref(const exec_ctx_t &ctx) const { const float beta = params.gemm_beta_; const dim_t acc_ldc = dst_is_acc ? 
ldc : N; const int scale_idx_mult - = this->pd()->attr()->scales_.get(DNNL_ARG_WEIGHTS).mask_ + = this->pd()->attr()->scales_.get_mask(DNNL_ARG_WEIGHTS) == (1 << (ndims - 1)); std::atomic st(status::success); @@ -297,11 +307,7 @@ status_t gemm_x8s8s32x_matmul_t::execute_ref(const exec_ctx_t &ctx) const { bool postops_in_matmul = need_post_processing(pd(), dst_zero_point_f32); assert(IMPLICATION(postops_in_matmul, params.has_pp_kernel_)); -#ifdef GCC_WA_LAMBDA_C_CAST - parallel(nthr, [= WA_THIS_COPY_CAPTURE, &st](int ithr, int nthr) { -#else parallel(nthr, [&](int ithr, int nthr) { -#endif size_t t_work_start {0}, t_work_end {0}; balance211(work_amount, nthr, ithr, t_work_start, t_work_end); diff --git a/src/cpu/matmul/matmul_utils.hpp b/src/cpu/matmul/matmul_utils.hpp index 28b3c7310d3..c1e9011bd82 100644 --- a/src/cpu/matmul/matmul_utils.hpp +++ b/src/cpu/matmul/matmul_utils.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2020-2024 Intel Corporation +* Copyright 2020-2025 Intel Corporation * Copyright 2022 Arm Ltd. and affiliates * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -19,6 +19,7 @@ #define CPU_MATMUL_UTILS_HPP #include "common/memory_desc_wrapper.hpp" +#include "common/tag_traits.hpp" #include "common/utils.hpp" #include "cpu/binary_injector_utils.hpp" @@ -150,6 +151,50 @@ struct matmul_helper_t { return true; } + // TODO: consolidate these functions with ones in simple_reorder.hpp, as they + // are copy-pasted, and address TODOs from there. + static status_t get_quant_md(memory_desc_t &md, const int ndims, + const dims_t in_dims, const int quant_mask, const dim_t g0, + const dim_t g1, const data_type_t dt) { + if (dt == data_type::undef || quant_mask < 0) { + md = glob_zero_md; + return status::success; + } + + dims_t quant_dims {}; + utils::copy_dims_with_mask(quant_dims, in_dims, ndims, quant_mask, + /* fill_with_ones = */ true); + if (ndims >= 2) { + quant_dims[ndims - 1] /= g1; + quant_dims[ndims - 2] /= g0; + } + + CHECK(memory_desc_init_by_tag( + md, ndims, quant_dims, dt, get_abx_tag(ndims))); + return status::success; + } + + static dim_t get_quant_off(const dims_t &input_idx, const int ndims, + const int quant_mask, const dim_t g0, const dim_t g1, + const memory_desc_t &quant_md) { + if (types::is_zero_md(&quant_md)) return 0; + + dims_t quant_idx {}; + utils::array_copy(quant_idx, input_idx, ndims); + utils::apply_mask_on_dims(quant_idx, ndims, quant_mask); + // Note: an `idx` must divide by a group value as grouped quantization + // applies to consecutive points. + // Using quant dimensions in `l_dims_by_l_offset` will lead to wrapping + // around dimensions instead of applying consecutively. + if (ndims >= 2) { + quant_idx[ndims - 1] /= g1; + quant_idx[ndims - 2] /= g0; + } + + const memory_desc_wrapper q_mdw(quant_md); + return q_mdw.off_v(quant_idx); + } + private: mdw_t src_md_; mdw_t weights_md_; diff --git a/src/cpu/matmul/ref_matmul.cpp b/src/cpu/matmul/ref_matmul.cpp index c1a182e2721..19dc7ea2894 100644 --- a/src/cpu/matmul/ref_matmul.cpp +++ b/src/cpu/matmul/ref_matmul.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2024 Intel Corporation +* Copyright 2019-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
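The `get_quant_md()` / `get_quant_off()` helpers added to `matmul_utils.hpp` above derive a scale/zero-point tensor's shape from the data shape: dimensions outside the quantization mask collapse to 1, and the two innermost dimensions are divided by the group sizes before offsets are computed through a regular memory descriptor. The same arithmetic for the common 2D weights case, as a standalone sketch (not the library helper):

#include <cassert>
#include <cstdint>

// Illustrative: quant dims for 2D weights [K, N]; mask bit 0 covers K,
// bit 1 covers N; g0 groups consecutive K points, g1 groups N points.
inline void quant_dims_2d(const std::int64_t dims[2], int mask,
        std::int64_t g0, std::int64_t g1, std::int64_t out[2]) {
    out[0] = (mask & (1 << 0)) ? dims[0] : 1;
    out[1] = (mask & (1 << 1)) ? dims[1] : 1;
    assert(out[0] % g0 == 0 && out[1] % g1 == 0);
    out[0] /= g0; // one quant entry per group of g0 K points
    out[1] /= g1;
}
// Example: dims {128, 64}, mask 0b11, g0 = 32, g1 = 1 gives {4, 64},
// i.e. per-N entries with groups of 32 along K.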
@@ -18,6 +18,7 @@ #include #include +#include #include "common/c_types_map.hpp" #include "common/dnnl_thread.hpp" #include "common/math_utils.hpp" @@ -87,17 +88,14 @@ status_t ref_matmul_t::execute_ref(const exec_ctx_t &ctx) const { const auto &attr_zps = pd()->attr()->zero_points_; const bool with_wei_zero_points = !attr_zps.has_default_values(DNNL_ARG_WEIGHTS); - int wei_zp_mask = 0; - attr_zps.get(DNNL_ARG_WEIGHTS, &wei_zp_mask); - const bool wei_zp_per_n = wei_zp_mask & pd()->wei_qmask_N(); - const bool wei_zp_per_k = wei_zp_mask & pd()->wei_qmask_K(); - const dim_t wei_zp_stride_n = wei_zp_per_n ? 1 : 0; - const dim_t wei_zp_stride_k = wei_zp_per_k ? wei_zp_per_n ? N : 1 : 0; + int wei_zp_mask = attr_zps.get_mask(DNNL_ARG_WEIGHTS); const auto &wei_zp_dt = attr_zps.get_data_type(DNNL_ARG_WEIGHTS); - const auto wei_zp_group_ndims = attr_zps.get_groups_ndims(DNNL_ARG_WEIGHTS); - const auto wei_zp_group_k = wei_zp_group_ndims > 0 - ? attr_zps.get_groups(DNNL_ARG_WEIGHTS)[0] - : 1; + const auto wei_zp_group_k = attr_zps.get_group(DNNL_ARG_WEIGHTS, 0); + const auto wei_zp_group_n = attr_zps.get_group(DNNL_ARG_WEIGHTS, 1); + // Initialize a memory desc for quant entries for easier offset calculation. + memory_desc_t wei_zp_md {}; + CHECK(matmul_helper_t::get_quant_md(wei_zp_md, ndims, weights_d.dims(), + wei_zp_mask, wei_zp_group_k, wei_zp_group_n, wei_zp_dt)); const int src_mask = utils::get_dims_mask(dst_d.dims(), src_d.dims(), ndims); @@ -108,25 +106,23 @@ status_t ref_matmul_t::execute_ref(const exec_ctx_t &ctx) const { // arg scales section const auto &attr_scales = pd()->attr()->scales_; - const bool with_src_scales - = !attr_scales.get(DNNL_ARG_SRC).has_default_values(); + const bool with_src_scales = !attr_scales.has_default_values(DNNL_ARG_SRC); const bool with_wei_scales - = !attr_scales.get(DNNL_ARG_WEIGHTS).has_default_values(); - const bool with_dst_scales - = !attr_scales.get(DNNL_ARG_DST).has_default_values(); - const auto wei_scale_mask = attr_scales.get(DNNL_ARG_WEIGHTS).mask_; - const bool wei_scale_per_n = wei_scale_mask & pd()->wei_qmask_N(); - const bool wei_scale_per_k = wei_scale_mask & pd()->wei_qmask_K(); - const dim_t wei_scale_stride_n = wei_scale_per_n ? 1 : 0; - const dim_t wei_scale_stride_k - = wei_scale_per_k ? wei_scale_per_n ? N : 1 : 0; - const auto &wei_scale_dt = attr_scales.get(DNNL_ARG_WEIGHTS).data_type_; - const auto scales_d + = !attr_scales.has_default_values(DNNL_ARG_WEIGHTS); + const bool with_dst_scales = !attr_scales.has_default_values(DNNL_ARG_DST); + const auto wei_scale_mask = attr_scales.get_mask(DNNL_ARG_WEIGHTS); + const dim_t wei_scale_stride_n + = (wei_scale_mask & pd()->wei_qmask_N()) ? 1 : 0; + const auto &wei_scale_dt = attr_scales.get_data_type(DNNL_ARG_WEIGHTS); + const auto wei_scales_d = ctx.memory_mdw(DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS); - const auto wei_scale_group_ndim = attr_scales.get(DNNL_ARG_WEIGHTS).ndims_; - const auto wei_scale_group_k = wei_scale_group_ndim > 0 - ? attr_scales.get(DNNL_ARG_WEIGHTS).group_dims_[0] - : 1; + const auto wei_scale_group_k = attr_scales.get_group(DNNL_ARG_WEIGHTS, 0); + const auto wei_scale_group_n = attr_scales.get_group(DNNL_ARG_WEIGHTS, 1); + // Initialize a memory desc for quant entries for easier offset calculation. 
+ memory_desc_t wei_scale_md {}; + CHECK(matmul_helper_t::get_quant_md(wei_scale_md, ndims, weights_d.dims(), + wei_scale_mask, wei_scale_group_k, wei_scale_group_n, + wei_scale_dt)); auto dst_rnd_mode = pd()->attr()->rounding_mode_.get(DNNL_ARG_DST); @@ -152,17 +148,24 @@ status_t ref_matmul_t::execute_ref(const exec_ctx_t &ctx) const { weights_d.data_type(), weights, weights_off); // weights decompression should happen before the operation if (with_wei_decompression) { - if (with_wei_zero_points) - w -= io::load_float_value(wei_zp_dt, wei_zero_points, - wei_zp_stride_n * n - + wei_zp_stride_k * (k / wei_zp_group_k)); + if (with_wei_zero_points) { + const dim_t wei_zp_offset = matmul_helper_t::get_quant_off( + weights_dims_idx, ndims, wei_zp_mask, + wei_zp_group_k, wei_zp_group_n, wei_zp_md); + const auto wei_zp = io::load_int_value( + wei_zp_dt, wei_zero_points, wei_zp_offset); + w -= wei_zp; + } if (with_wei_scales) { - float wei_scale = scales_d.nelems() == 1 + const dim_t wei_scale_offset + = matmul_helper_t::get_quant_off(weights_dims_idx, + ndims, wei_scale_mask, wei_scale_group_k, + wei_scale_group_n, wei_scale_md); + // Single scale value was already converted into f32. + const float wei_scale = wei_scales_d.nelems() == 1 ? wei_scales[0] - : io::load_float_value(wei_scale_dt, wei_scales, - wei_scale_stride_n * n - + wei_scale_stride_k - * (k / wei_scale_group_k)); + : io::load_float_value( + wei_scale_dt, wei_scales, wei_scale_offset); w *= wei_scale; } } @@ -182,36 +185,52 @@ status_t ref_matmul_t::execute_ref(const exec_ctx_t &ctx) const { auto sum_dt = pd()->attr()->post_ops_.get_sum_dt(dst_d.data_type()); bool with_dropout = !pd()->attr()->dropout_.has_default_values(); - // computations - parallel_nd(batch, M, N, [&](dim_t mb, dim_t m, dim_t n) { - dims_t dst_dims_idx; - // account for M, N dims for index calculations - const size_t l_offset = mb * M * N + m * N + n; - utils::l_dims_by_l_offset(dst_dims_idx, l_offset, dst_d.dims(), ndims); - float d = ker(dst_dims_idx, m, n); - if (with_src_scales) d *= src_scales[0]; - if (with_wei_scales && !with_wei_decompression) - d *= wei_scales[wei_scale_stride_n * n]; - if (bias) d += ker_bias(dst_dims_idx); - - const auto dst_off = dst_d.off_v(dst_dims_idx); - if (non_default_attrs) { - if (with_dropout) - d = ref_dropout(d, dropout_mask, dst_off, *p, *seed); - ref_post_ops_t::args_t args; - args.dst_val = io::load_float_value(sum_dt, dst, dst_off); - args.ctx = &ctx; - args.l_offset = l_offset; - args.dst_md = pd()->dst_md(); - ref_post_ops->execute(d, args); - } - if (with_dst_scales) d *= dst_scales[0]; - if (dst_rnd_mode == rounding_mode::stochastic) - d = math::stochastic_round_fwd( - d, dst_off, rnd_seed[0], dst_d.data_type()); - io::store_float_value(dst_d.data_type(), d, dst, dst_off); - utils::dim_iterator(dst_d.dims(), dst_dims_idx, batch_ndims); - }); + // computations Note: If dst type is < 8 bits, we cannot split a + // byte during store or we get a race condition. To simplify + // logic, we limit parallelization on M and N by a factor of 2. 
+ parallel_nd(batch, utils::div_up(M, 2), utils::div_up(N, 2), + [&](dim_t mb, dim_t m_, dim_t n_) { + for_(int m = 2 * m_; m < std::min(2 * (m_ + 1), M); m++) + for (int n = 2 * n_; n < std::min(2 * (n_ + 1), N); n++) { + dims_t dst_dims_idx; + // account for M, N dims for index calculations + const size_t l_offset = mb * M * N + m * N + n; + utils::l_dims_by_l_offset( + dst_dims_idx, l_offset, dst_d.dims(), ndims); + float d = ker(dst_dims_idx, m, n); + if (with_src_scales) d *= src_scales[0]; + if (with_wei_scales && !with_wei_decompression) { + // Single scale value was already converted into f32. + const float wei_scale = wei_scales_d.nelems() == 1 + ? wei_scales[0] + : io::load_float_value(wei_scale_dt, wei_scales, + wei_scale_stride_n * n); + d *= wei_scale; + } + if (bias) d += ker_bias(dst_dims_idx); + + const auto dst_off = dst_d.off_v(dst_dims_idx); + if (non_default_attrs) { + if (with_dropout) + d = ref_dropout( + d, dropout_mask, dst_off, *p, *seed); + ref_post_ops_t::args_t args; + args.dst_val + = io::load_float_value(sum_dt, dst, dst_off); + args.ctx = &ctx; + args.l_offset = l_offset; + args.dst_md = pd()->dst_md(); + ref_post_ops->execute(d, args); + } + if (with_dst_scales) d *= dst_scales[0]; + if (dst_rnd_mode == rounding_mode::stochastic) + d = math::stochastic_round_fwd( + d, dst_off, rnd_seed[0], dst_d.data_type()); + io::store_float_value(dst_d.data_type(), d, dst, dst_off); + utils::dim_iterator( + dst_d.dims(), dst_dims_idx, batch_ndims); + } + }); return status::success; } diff --git a/src/cpu/matmul/ref_matmul.hpp b/src/cpu/matmul/ref_matmul.hpp index 1c2d3ea392b..19dc04adc0a 100644 --- a/src/cpu/matmul/ref_matmul.hpp +++ b/src/cpu/matmul/ref_matmul.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2024 Intel Corporation +* Copyright 2019-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -49,78 +49,110 @@ struct ref_matmul_t : public primitive_t { const auto bia_type = weights_md(1)->data_type; const auto dst_type = dst_md(0)->data_type; - bool ok = is_dense_format_kind() - && utils::one_of(src_type, f32, bf16, f16, f8_e5m2, f8_e4m3) - && utils::one_of(wei_type, f32, bf16, f16, f8_e5m2, f8_e4m3, - u8, s8, u4, s4) - && utils::one_of(dst_type, f32, bf16, f16, f8_e5m2, f8_e4m3) - && (src_type == wei_type - || utils::one_of(wei_type, u8, s8, u4, s4)) - /* int8 weights decompression support */ - && IMPLICATION(utils::one_of(wei_type, u8, s8), - attr_.mayiconvert(wei_type, src_type)) - && IMPLICATION(src_type == f32, dst_type == f32) - && IMPLICATION(src_type == bf16, - utils::one_of(dst_type, f32, bf16)) - && IMPLICATION( - src_type == f16, utils::one_of(dst_type, f32, f16)) - // TODO: any implication on allowed dst data type for fp8? 
- && IMPLICATION(with_bias(), + VDISPATCH_MATMUL( + is_dense_format_kind(), VERBOSE_UNSUPPORTED_SPARSE_CFG); + VDISPATCH_MATMUL(utils::one_of(src_type, f32, bf16, f16, f8_e5m2, + f8_e4m3, f4_e2m1, f4_e3m0), + VERBOSE_UNSUPPORTED_DT); + VDISPATCH_MATMUL(utils::one_of(wei_type, f32, bf16, f16, f8_e5m2, + f8_e4m3, f4_e2m1, f4_e3m0, u8, s8, u4, s4), + VERBOSE_UNSUPPORTED_DT); + VDISPATCH_MATMUL(utils::one_of(dst_type, f32, bf16, f16, f8_e5m2, + f8_e4m3, f4_e2m1, f4_e3m0), + VERBOSE_UNSUPPORTED_DT); + VDISPATCH_MATMUL((src_type == wei_type + || utils::one_of(wei_type, bf16, f16, u8, + s8, u4, s4, f4_e3m0)), + VERBOSE_UNSUPPORTED_DT); + /* int8 weights decompression support */ + VDISPATCH_MATMUL(IMPLICATION(utils::one_of(wei_type, u8, s8), + attr_.mayiconvert(wei_type, src_type)), + VERBOSE_UNSUPPORTED_DT); + VDISPATCH_MATMUL(IMPLICATION(src_type == f32, dst_type == f32), + VERBOSE_UNSUPPORTED_DT); + VDISPATCH_MATMUL(IMPLICATION(src_type == bf16, + utils::one_of(dst_type, f32, bf16)), + VERBOSE_UNSUPPORTED_DT); + VDISPATCH_MATMUL(IMPLICATION(src_type == f16, + utils::one_of(dst_type, f32, f16)), + VERBOSE_UNSUPPORTED_DT); + // TODO: any implication on allowed dst data type for fp8? + VDISPATCH_MATMUL( + IMPLICATION(with_bias(), utils::one_of( bia_type, f32, bf16, f16, f8_e5m2, f8_e4m3) && IMPLICATION( - src_type == f32, bia_type == f32) - && IMPLICATION(src_type == f16, + wei_type == f32, bia_type == f32) + && IMPLICATION(wei_type == f16, utils::one_of(bia_type, f32, f16)) - && IMPLICATION(src_type == bf16, + && IMPLICATION(wei_type == bf16, utils::one_of(bia_type, f32, bf16)) // TODO: any implication on allowed bias // data type for fp8? - ) - && platform::has_data_type_support(src_type) - && attr()->has_default_values( - smask_t::scales_runtime_data_type - | smask_t::scales_runtime_groups - | smask_t::zero_points_runtime_data_type - | smask_t::zero_points_runtime_groups + ), + VERBOSE_UNSUPPORTED_BIAS_CFG); + VDISPATCH_MATMUL(platform::has_data_type_support(src_type), + VERBOSE_UNSUPPORTED_DT); + VDISPATCH_MATMUL( + attr()->has_default_values(smask_t::scales_data_type + | smask_t::scales_groups + | smask_t::zero_points_data_type + | smask_t::zero_points_groups | smask_t::post_ops | smask_t::sum_dt | smask_t::fpmath_mode | smask_t::dropout | smask_t::rounding_mode, - dst_type) - && attr_.post_ops_.check_sum_consistency(dst_type, - /* is_int8 */ false) - && ref_post_ops_t::primitive_kind_ok(attr()->post_ops_) - && attr_scales_ok() && set_default_formats() - && zero_points_ok() - && attr_.set_default_formats(dst_md(0)) == status::success - && IMPLICATION(!attr_.dropout_.has_default_values(), + dst_type), + VERBOSE_UNSUPPORTED_ATTR); + VDISPATCH_MATMUL(attr_.post_ops_.check_sum_consistency(dst_type, + /* is_int8 */ false), + VERBOSE_UNSUPPORTED_POSTOP); + VDISPATCH_MATMUL( + ref_post_ops_t::primitive_kind_ok(attr()->post_ops_), + VERBOSE_UNSUPPORTED_POSTOP); + VDISPATCH_MATMUL(attr_scales_ok(), VERBOSE_UNSUPPORTED_SCALES_CFG); + VDISPATCH_MATMUL(set_default_formats(), VERBOSE_UNSUPPORTED_TAG); + VDISPATCH_MATMUL(zero_points_ok(), VERBOSE_UNSUPPORTED_ZP_CFG); + VDISPATCH_MATMUL( + attr_.set_default_formats(dst_md(0)) == status::success, + VERBOSE_UNSUPPORTED_POSTOP); + VDISPATCH_MATMUL( + IMPLICATION(!attr_.dropout_.has_default_values(), utils::one_of( attr_.dropout_.dropout_desc_.data_type, u8, - s8)) - && IMPLICATION(!attr_.dropout_.has_default_values(), + s8)), + VERBOSE_UNSUPPORTED_ATTR); + VDISPATCH_MATMUL( + IMPLICATION(!attr_.dropout_.has_default_values(), 
memory_desc_wrapper(dst_md(0)).similar_to( - attr_.dropout_.dropout_desc_, true, false)); - return ok ? status::success : status::unimplemented; + attr_.dropout_.dropout_desc_, true, false)), + VERBOSE_UNSUPPORTED_ATTR); + + return status::success; } private: bool zero_points_ok() const { + const auto &zp = attr()->zero_points_; + if (!zp.has_default_values(DNNL_ARG_SRC)) { return false; } /* weights decompression requires zero points support */ - int mask_wei = 0; - attr()->zero_points_.get(DNNL_ARG_WEIGHTS, &mask_wei); - const auto wei_group_ndims - = attr()->zero_points_.get_groups_ndims(DNNL_ARG_WEIGHTS); - const auto wei_group_dims - = attr()->zero_points_.get_groups(DNNL_ARG_WEIGHTS); - - return attr()->zero_points_.has_default_values(DNNL_ARG_SRC) - && attr()->zero_points_.has_default_values(DNNL_ARG_DST) - && utils::one_of(mask_wei, 0, wei_qmask_N(), - wei_qmask_N() + wei_qmask_K()) - && utils::one_of(wei_group_ndims, 0, 2) - && IMPLICATION(wei_group_ndims == 2, - wei_group_dims[1] == 1 - && K() % wei_group_dims[0] == 0); + if (!zp.has_default_values(DNNL_ARG_WEIGHTS)) { + if (!zp.get(DNNL_ARG_WEIGHTS).has_default_groups()) { + const auto gK = zp.get_group(DNNL_ARG_WEIGHTS, 0); + bool ok = IMPLICATION(gK > 1, K() % gK == 0); + if (!ok) return false; + + const auto gN = zp.get_group(DNNL_ARG_WEIGHTS, 1); + ok = IMPLICATION(gN > 1, N() % gN == 0); + if (!ok) return false; + + // Only one non-unit group is supported. + ok = utils::one_of(1, gK, gN); + if (!ok) return false; + } + } + if (!zp.has_default_values(DNNL_ARG_DST)) { return false; } + + return true; } }; diff --git a/src/cpu/matmul/ref_matmul_int8.cpp b/src/cpu/matmul/ref_matmul_int8.cpp index e336f46b3c0..daea0cfa3d9 100644 --- a/src/cpu/matmul/ref_matmul_int8.cpp +++ b/src/cpu/matmul/ref_matmul_int8.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2021-2024 Intel Corporation +* Copyright 2021-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -47,7 +47,7 @@ status_t ref_matmul_int8_t::execute_ref(const exec_ctx_t &ctx) const { DEFINE_ARG_SCALES_BUFFER(wei_scales, DNNL_ARG_WEIGHTS); DEFINE_ARG_SCALES_BUFFER(dst_scales, DNNL_ARG_DST); - DEFINE_ZERO_POINTS_BUFFER(src_zero_point, DNNL_ARG_SRC); + DEFINE_ZERO_POINTS_BUFFER(src_zero_points, DNNL_ARG_SRC); DEFINE_ZERO_POINTS_BUFFER(wei_zero_points, DNNL_ARG_WEIGHTS); DEFINE_ZERO_POINTS_BUFFER(dst_zero_point, DNNL_ARG_DST); @@ -78,24 +78,26 @@ status_t ref_matmul_int8_t::execute_ref(const exec_ctx_t &ctx) const { const auto &attr_zps = pd()->attr()->zero_points_; const bool with_src_zero_points = !attr_zps.has_default_values(DNNL_ARG_SRC); + int src_zp_mask = attr_zps.get_mask(DNNL_ARG_SRC); + const auto &src_zp_dt = attr_zps.get_data_type(DNNL_ARG_SRC); + const auto src_zp_group_k = attr_zps.get_group(DNNL_ARG_SRC, 1); + const auto src_zp_ngroups_k = K / src_zp_group_k; + // Initialize a memory desc for quant entries for easier offset calculation. 
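Both `zero_points_ok()` in ref_matmul.hpp above and `attr_zero_points_ok()` in ref_matmul_int8.hpp below reduce the grouped-weights checks to the same three conditions on the group pair (gK, gN): each non-unit group must divide its dimension, and at most one of the two groups may be non-unit. As a standalone predicate (illustrative, not library code):

#include <cstdint>

// Illustrative: validate grouped quantization along weights dims K and N.
inline bool wei_groups_ok(std::int64_t K, std::int64_t N, std::int64_t gK,
        std::int64_t gN) {
    if (gK > 1 && K % gK != 0) return false; // group must divide K
    if (gN > 1 && N % gN != 0) return false; // group must divide N
    return gK == 1 || gN == 1; // only one non-unit group is supported
}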
+ memory_desc_t src_zp_md {}; + CHECK(matmul_helper_t::get_quant_md(src_zp_md, ndims, src_d.dims(), + src_zp_mask, 1, src_zp_group_k, src_zp_dt)); + const bool with_wei_zero_points = !attr_zps.has_default_values(DNNL_ARG_WEIGHTS); - int src_zp_mask = 0; - int wei_zp_mask = 0; - attr_zps.get(DNNL_ARG_SRC, &src_zp_mask); - attr_zps.get(DNNL_ARG_WEIGHTS, &wei_zp_mask); - const bool src_zp_per_k = src_zp_mask & pd()->src_qmask_K(); - const bool wei_zp_per_n = wei_zp_mask & pd()->wei_qmask_N(); - const bool wei_zp_per_k = wei_zp_mask & pd()->wei_qmask_K(); + int wei_zp_mask = attr_zps.get_mask(DNNL_ARG_WEIGHTS); const auto &wei_zp_dt = attr_zps.get_data_type(DNNL_ARG_WEIGHTS); - const auto wei_zp_group_ndims = attr_zps.get_groups_ndims(DNNL_ARG_WEIGHTS); - const auto wei_zp_group_k = wei_zp_group_ndims > 0 - ? attr_zps.get_groups(DNNL_ARG_WEIGHTS)[0] - : (wei_zp_per_k ? 1 : K); - const dim_t src_zp_stride_k = src_zp_per_k ? 1 : 0; - const dim_t wei_zp_stride_n = wei_zp_per_n ? 1 : 0; - const dim_t wei_zp_stride_k = wei_zp_group_k < K ? wei_zp_per_n ? N : 1 : 0; + const auto wei_zp_group_k = attr_zps.get_group(DNNL_ARG_WEIGHTS, 0); + const auto wei_zp_group_n = attr_zps.get_group(DNNL_ARG_WEIGHTS, 1); const auto wei_zp_ngroups_k = K / wei_zp_group_k; + // Initialize a memory desc for quant entries for easier offset calculation. + memory_desc_t wei_zp_md {}; + CHECK(matmul_helper_t::get_quant_md(wei_zp_md, ndims, weights_d.dims(), + wei_zp_mask, wei_zp_group_k, wei_zp_group_n, wei_zp_dt)); const int src_mask = utils::get_dims_mask(dst_d.dims(), src_d.dims(), ndims); @@ -105,46 +107,40 @@ status_t ref_matmul_int8_t::execute_ref(const exec_ctx_t &ctx) const { = utils::get_dims_mask(dst_d.dims(), bia_d.dims(), ndims); // zp_idx_mult = 1 for per_dim1 zero points and 0, otherwise - const int dst_zp_idx_mult = !attr_zps.common(DNNL_ARG_DST); + const int dst_zp_idx_mult = !attr_zps.has_default_values(DNNL_ARG_DST) + && attr_zps.get_mask(DNNL_ARG_DST) > 0; // arg scales section const auto &attr_scales = pd()->attr()->scales_; - const bool with_src_scales - = !attr_scales.get(DNNL_ARG_SRC).has_default_values(); const bool with_wei_scales - = !attr_scales.get(DNNL_ARG_WEIGHTS).has_default_values(); - const bool with_dst_scales - = !attr_scales.get(DNNL_ARG_DST).has_default_values(); - const int src_scale_mask = attr_scales.get(DNNL_ARG_SRC).mask_; - const int wei_scale_mask = attr_scales.get(DNNL_ARG_WEIGHTS).mask_; - const auto &src_scales_dt = attr_scales.get_data_type(DNNL_ARG_SRC); - const auto &wei_scales_dt = attr_scales.get_data_type(DNNL_ARG_WEIGHTS); - const bool src_scale_per_k = src_scale_mask & pd()->src_qmask_K(); - const bool src_scale_per_m = src_scale_mask & pd()->src_qmask_M(); - const bool wei_scale_per_n = wei_scale_mask & pd()->wei_qmask_N(); - const bool wei_scale_per_k = wei_scale_mask & pd()->wei_qmask_K(); - const auto src_scale_group_ndim = attr_scales.get(DNNL_ARG_SRC).ndims_; - const auto wei_scale_group_ndim = attr_scales.get(DNNL_ARG_WEIGHTS).ndims_; - const auto src_scale_group_k = src_scale_group_ndim > 0 - ? attr_scales.get(DNNL_ARG_SRC).group_dims_[1] - : (src_scale_per_k ? 1 : K); - const auto wei_scale_group_k = wei_scale_group_ndim > 0 - ? attr_scales.get(DNNL_ARG_WEIGHTS).group_dims_[0] - : (wei_scale_per_k ? 
1 : K); - const auto src_scale_ngroups_k = K / src_scale_group_k; + = !attr_scales.has_default_values(DNNL_ARG_WEIGHTS); + const bool with_dst_scales = !attr_scales.has_default_values(DNNL_ARG_DST); + const int wei_scale_mask = attr_scales.get_mask(DNNL_ARG_WEIGHTS); + const auto &wei_scale_dt = attr_scales.get_data_type(DNNL_ARG_WEIGHTS); + const auto wei_scale_group_k = attr_scales.get_group(DNNL_ARG_WEIGHTS, 0); + const auto wei_scale_group_n = attr_scales.get_group(DNNL_ARG_WEIGHTS, 1); const auto wei_scale_ngroups_k = K / wei_scale_group_k; - const dim_t wei_scale_stride_n = wei_scale_per_n ? 1 : 0; - const dim_t src_scale_stride_k = src_scale_group_k < K ? 1 : 0; - const dim_t wei_scale_stride_k - = wei_scale_group_k < K ? wei_scale_per_n ? N : 1 : 0; - const dim_t src_scale_stride_m = src_scale_per_m - ? src_scale_group_k < K ? src_scale_ngroups_k : 1 - : 0; - const auto scale_ngroups_k - = std::max(src_scale_ngroups_k, wei_scale_ngroups_k); + // Initialize a memory desc for quant entries for easier offset calculation. + memory_desc_t wei_scale_md {}; + CHECK(matmul_helper_t::get_quant_md(wei_scale_md, ndims, weights_d.dims(), + wei_scale_mask, wei_scale_group_k, wei_scale_group_n, + wei_scale_dt)); + + const bool with_src_scales = !attr_scales.has_default_values(DNNL_ARG_SRC); + const int src_scale_mask = attr_scales.get_mask(DNNL_ARG_SRC); + const auto &src_scale_dt = attr_scales.get_data_type(DNNL_ARG_SRC); + const auto src_scale_group_k = attr_scales.get_group(DNNL_ARG_SRC, 1); + const auto src_scale_ngroups_k = K / src_scale_group_k; + // Initialize a memory desc for quant entries for easier offset calculation. + memory_desc_t src_scale_md {}; + CHECK(matmul_helper_t::get_quant_md(src_scale_md, ndims, src_d.dims(), + src_scale_mask, 1, src_scale_group_k, src_scale_dt)); // For compute kernel, the minimal group is picked. 
- const auto ngroups_k = std::max(wei_zp_ngroups_k, scale_ngroups_k); + const auto zp_ngroups_k = std::max(src_zp_ngroups_k, wei_zp_ngroups_k); + const auto scale_ngroups_k + = std::max(src_scale_ngroups_k, wei_scale_ngroups_k); + const auto ngroups_k = std::max(zp_ngroups_k, scale_ngroups_k); const auto group_k = K / ngroups_k; // mm kernel @@ -161,6 +157,7 @@ status_t ref_matmul_int8_t::execute_ref(const exec_ctx_t &ctx) const { utils::copy_dims_with_mask(src_dims_idx, dst_dims_idx, ndims, src_mask); utils::copy_dims_with_mask( weights_dims_idx, dst_dims_idx, ndims, wei_mask); + src_dims_idx[ndims - 2] = m; weights_dims_idx[ndims - 1] = n; auto &src_k_dim = src_dims_idx[ndims - 1]; @@ -176,12 +173,17 @@ status_t ref_matmul_int8_t::execute_ref(const exec_ctx_t &ctx) const { int w = io::load_int_value( weights_d.data_type(), weights, weights_off); if (with_src_zero_points) { - s -= io::load_int_value(data_type::s32, src_zero_point, - src_zp_stride_k * k); + const dim_t src_zp_offset = matmul_helper_t::get_quant_off( + src_dims_idx, ndims, src_zp_mask, 1, src_zp_group_k, + src_zp_md); + const auto src_zp = io::load_int_value( + src_zp_dt, src_zero_points, src_zp_offset); + s -= src_zp; } if (with_wei_zero_points) { - const auto wei_zp_offset = wei_zp_stride_n * n - + wei_zp_stride_k * (wei_k_dim / wei_zp_group_k); + const dim_t wei_zp_offset = matmul_helper_t::get_quant_off( + weights_dims_idx, ndims, wei_zp_mask, + wei_zp_group_k, wei_zp_group_n, wei_zp_md); const auto wei_zp = io::load_int_value( wei_zp_dt, wei_zero_points, wei_zp_offset); w -= wei_zp; @@ -192,24 +194,25 @@ status_t ref_matmul_int8_t::execute_ref(const exec_ctx_t &ctx) const { // Apply scaling after computing a group. float acc_f = static_cast(acc); if (with_src_scales) { + const dim_t src_scale_offset = matmul_helper_t::get_quant_off( + src_dims_idx, ndims, src_scale_mask, 1, + src_scale_group_k, src_scale_md); // Single scale value was already converted into f32. - const auto src_scale_offset - = src_scale_stride_k * (src_k_dim / src_scale_group_k) - + src_scale_stride_m * m; const float src_scale = src_scales_d.nelems() == 1 ? src_scales[0] : io::load_float_value( - src_scales_dt, src_scales, src_scale_offset); + src_scale_dt, src_scales, src_scale_offset); acc_f *= src_scale; } if (with_wei_scales) { + const dim_t wei_scale_offset = matmul_helper_t::get_quant_off( + weights_dims_idx, ndims, wei_scale_mask, + wei_scale_group_k, wei_scale_group_n, wei_scale_md); // Single scale value was already converted into f32. - const auto wei_scale_offset = wei_scale_stride_n * n - + wei_scale_stride_k * (wei_k_dim / wei_scale_group_k); const float wei_scale = wei_scales_d.nelems() == 1 ? wei_scales[0] : io::load_float_value( - wei_scales_dt, wei_scales, wei_scale_offset); + wei_scale_dt, wei_scales, wei_scale_offset); acc_f *= wei_scale; } d += acc_f; diff --git a/src/cpu/matmul/ref_matmul_int8.hpp b/src/cpu/matmul/ref_matmul_int8.hpp index e0ac608d8c6..d9cee62d993 100644 --- a/src/cpu/matmul/ref_matmul_int8.hpp +++ b/src/cpu/matmul/ref_matmul_int8.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2021-2024 Intel Corporation +* Copyright 2021-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
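The `ngroups_k` computation above chooses the finest K granularity across all source/weights scale and zero-point groups, so a single blocking of the K loop is valid for every quantization parameter: the kernel accumulates integers over group_k consecutive points, then applies the per-group scales to the partial sum. The pattern, with zero points omitted for brevity, in a compact standalone sketch:

#include <cstdint>

// Illustrative: int8 dot product over K with per-group scaling; scales must
// be constant within each group of group_k consecutive k values.
inline float dot_grouped(const std::int8_t *a, const std::int8_t *b,
        std::int64_t K, std::int64_t group_k, const float *group_scales) {
    float d = 0.0f;
    for (std::int64_t g = 0; g < K / group_k; ++g) {
        std::int32_t acc = 0; // integer accumulation inside one group
        for (std::int64_t k = g * group_k; k < (g + 1) * group_k; ++k)
            acc += std::int32_t(a[k]) * std::int32_t(b[k]);
        d += group_scales[g] * float(acc); // scale applied per group
    }
    return d;
}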
@@ -48,50 +48,84 @@ struct ref_matmul_int8_t : public primitive_t { const auto bia_type = weights_md(1)->data_type; const auto dst_type = dst_md(0)->data_type; - bool ok = is_dense_format_kind() && utils::one_of(src_type, s8, u8) - && utils::one_of(wei_type, s8, u8, s4, u4) - && IMPLICATION(with_bias(), - utils::one_of( - bia_type, f32, bf16, f16, s32, s8, u8)) - && utils::one_of(dst_type, f32, bf16, f16, s32, s8, u8) - && attr()->has_default_values( - smask_t::scales_runtime_data_type - | smask_t::scales_runtime_groups - | smask_t::zero_points_runtime_data_type - | smask_t::zero_points_runtime_groups + VDISPATCH_MATMUL( + is_dense_format_kind(), VERBOSE_UNSUPPORTED_SPARSE_CFG); + VDISPATCH_MATMUL( + utils::one_of(src_type, s8, u8), VERBOSE_UNSUPPORTED_DT); + VDISPATCH_MATMUL(utils::one_of(wei_type, s8, u8, s4, u4), + VERBOSE_UNSUPPORTED_DT); + VDISPATCH_MATMUL(IMPLICATION(with_bias(), + utils::one_of(bia_type, f32, bf16, f16, + s32, s8, u8)), + VERBOSE_UNSUPPORTED_DT); + VDISPATCH_MATMUL( + utils::one_of(dst_type, f32, bf16, f16, s32, s8, u8), + VERBOSE_UNSUPPORTED_DT); + VDISPATCH_MATMUL( + attr()->has_default_values(smask_t::scales_data_type + | smask_t::scales_groups + | smask_t::zero_points_data_type + | smask_t::zero_points_groups | smask_t::post_ops | smask_t::sum_dt, - dst_type) - && attr_.post_ops_.check_sum_consistency(dst_type, - /* is_int8 */ true) - && ref_post_ops_t::primitive_kind_ok(attr()->post_ops_) - && attr_scales_ok() && attr_zero_points_ok() - && set_default_formats() - && attr_.set_default_formats(dst_md(0)) == status::success; - return ok ? status::success : status::unimplemented; + dst_type), + VERBOSE_UNSUPPORTED_ATTR); + VDISPATCH_MATMUL(attr_.post_ops_.check_sum_consistency(dst_type, + /* is_int8 */ true), + VERBOSE_UNSUPPORTED_POSTOP); + VDISPATCH_MATMUL( + ref_post_ops_t::primitive_kind_ok(attr()->post_ops_), + VERBOSE_UNSUPPORTED_POSTOP); + VDISPATCH_MATMUL(attr_scales_ok(), VERBOSE_UNSUPPORTED_SCALES_CFG); + VDISPATCH_MATMUL(attr_zero_points_ok(), VERBOSE_UNSUPPORTED_ZP_CFG); + VDISPATCH_MATMUL(set_default_formats(), VERBOSE_UNSUPPORTED_TAG); + VDISPATCH_MATMUL( + attr_.set_default_formats(dst_md(0)) == status::success, + VERBOSE_UNSUPPORTED_POSTOP); + + return status::success; } private: bool attr_zero_points_ok() const { - int mask_src = 0, mask_wei = 0, mask_dst = 0; - CHECK_BOOL(attr()->zero_points_.get(DNNL_ARG_SRC, &mask_src)); - CHECK_BOOL(attr()->zero_points_.get(DNNL_ARG_WEIGHTS, &mask_wei)); - CHECK_BOOL(attr()->zero_points_.get(DNNL_ARG_DST, &mask_dst)); - - const auto wei_group_ndims - = attr()->zero_points_.get_groups_ndims(DNNL_ARG_WEIGHTS); - const auto wei_group_dims - = attr()->zero_points_.get_groups(DNNL_ARG_WEIGHTS); - - bool mask_src_ok = utils::one_of(mask_src, 0, wei_qmask_N()); - bool mask_wei_ok = utils::one_of( - mask_wei, 0, wei_qmask_N(), wei_qmask_K() + wei_qmask_N()); - bool mask_dst_ok = utils::one_of(mask_dst, 0, wei_qmask_N()); - - return mask_src_ok && mask_wei_ok && mask_dst_ok - && utils::one_of(wei_group_ndims, 0, 2) - && IMPLICATION(wei_group_ndims == 2, - wei_group_dims[1] == 1 - && K() % wei_group_dims[0] == 0); + const auto &zp = attr()->zero_points_; + if (!zp.has_default_values(DNNL_ARG_SRC)) { + int mask_src = zp.get_mask(DNNL_ARG_SRC); + bool ok = utils::one_of(mask_src, 0, src_qmask_K(), + src_qmask_M() + src_qmask_K()); + if (!ok) return false; + + if (!zp.get(DNNL_ARG_SRC).has_default_groups()) { + const auto gM = zp.get_group(DNNL_ARG_SRC, 0); + ok = gM == 1; + if (!ok) return false; + + const auto gK = 
zp.get_group(DNNL_ARG_SRC, 1); + ok = IMPLICATION(gK > 1, K() % gK == 0); + if (!ok) return false; + } + } + /* weights decompression requires zero points support */ + if (!zp.has_default_values(DNNL_ARG_WEIGHTS)) { + if (!zp.get(DNNL_ARG_WEIGHTS).has_default_groups()) { + const auto gK = zp.get_group(DNNL_ARG_WEIGHTS, 0); + bool ok = IMPLICATION(gK > 1, K() % gK == 0); + if (!ok) return false; + + const auto gN = zp.get_group(DNNL_ARG_WEIGHTS, 1); + ok = IMPLICATION(gN > 1, N() % gN == 0); + if (!ok) return false; + + // Only one non-unit group is supported. + ok = utils::one_of(1, gK, gN); + if (!ok) return false; + } + } + if (!zp.has_default_values(DNNL_ARG_DST)) { + int mask_dst = zp.get_mask(DNNL_ARG_DST); + bool ok = utils::one_of(mask_dst, 0, wei_qmask_N()); + if (!ok) return false; + } + return true; } }; diff --git a/src/cpu/matmul/ref_sparse_matmul.cpp b/src/cpu/matmul/ref_sparse_matmul.cpp index f95a35c505a..d08b1b09d73 100644 --- a/src/cpu/matmul/ref_sparse_matmul.cpp +++ b/src/cpu/matmul/ref_sparse_matmul.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2023 Intel Corporation +* Copyright 2023-2024 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,6 +18,8 @@ #include "common/math_utils.hpp" #include "common/type_helpers.hpp" +#include "cpu/ref_io_helper.hpp" + #include "cpu/matmul/ref_sparse_matmul.hpp" namespace dnnl { @@ -27,7 +29,7 @@ namespace matmul { status_t ref_sparse_matmul_t::execute(const exec_ctx_t &ctx) const { status_t status = status::success; - auto dst = CTX_OUT_CLEAN_MEM(float *, DNNL_ARG_DST, status); + auto dst = CTX_OUT_CLEAN_MEM(void *, DNNL_ARG_DST, status); CHECK(status); const auto src_d = ctx.memory_mdw(DNNL_ARG_SRC, pd()->src_md()); @@ -38,48 +40,161 @@ status_t ref_sparse_matmul_t::execute(const exec_ctx_t &ctx) const { const dim_t N = dst_d.dims()[1]; const dim_t K = src_d.dims()[1]; - parallel_nd(M, N, [&](dim_t i, dim_t j) { dst[i * N + j] = 0.0f; }); + const data_type_t mm_dt = src_d.data_type(); + auto scratchpad = ctx.get_scratchpad_grantor(); + + parallel_nd(M, N, [&](dim_t i, dim_t j) { + const dim_t dst_idx = i * N + j; + io::store_float_value(dst_d.data_type(), 0.0f, dst, dst_idx); + }); if (weights_d.is_sparse_desc()) { - const auto src = CTX_IN_MEM(const float *, DNNL_ARG_SRC); - const auto wei_values = CTX_IN_MEM(const float *, DNNL_ARG_WEIGHTS, 0); - const auto wei_indices - = CTX_IN_MEM(const int32_t *, DNNL_ARG_WEIGHTS, 1); - const auto wei_pointers - = CTX_IN_MEM(const int32_t *, DNNL_ARG_WEIGHTS, 2); + const auto src = CTX_IN_MEM(const void *, DNNL_ARG_SRC); + const auto wei_values = CTX_IN_MEM(const void *, DNNL_ARG_WEIGHTS, 0); + auto wei_buffer_1 = CTX_IN_MEM(const int32_t *, DNNL_ARG_WEIGHTS, 1); + auto wei_buffer_2 = CTX_IN_MEM(const int32_t *, DNNL_ARG_WEIGHTS, 2); + + // Both COO- and CSR-encoded data are operated on using the CSR kernel for + // matrix multiplication. + // For COO encoding, data preparation includes using a temporary + // buffer to convert the data to the CSR format. + // Matrix multiplication is then carried out using the CSR-encoded data. + const int32_t *wei_indices = nullptr; + const int32_t *wei_pointers = nullptr; + + if (weights_d.encoding() == sparse_encoding::csr) { + // For CSR encodings, pointer and indices assignment is + // straightforward as, + // index 1 - index buffer, index 2 - pointer buffer.
+            wei_indices = wei_buffer_1;
+            wei_pointers = wei_buffer_2;
+        } else if (weights_d.encoding() == sparse_encoding::coo) {
+            // For COO encodings, the two index buffers hold the row and column
+            // indices respectively. For CSR conversion, the row indices are
+            // compressed to generate the CSR pointers.
+            wei_indices = wei_buffer_2;
+
+            int32_t *wei_row_pointers = scratchpad.template get<int32_t>(
+                    memory_tracking::names::key_matmul_sparse_tmp_ptr);
+
+            parallel_nd(K + 1, [&](dim_t k) {
+                io::store_float_value(
+                        weights_d.metadata_type(0), 0, wei_row_pointers, k);
+            });
+
+            cvt_coo_indices_to_csr_pointers(
+                    wei_buffer_1, wei_row_pointers, weights_d.nnz(), K);
+
+            wei_pointers = wei_row_pointers;
+        }
+
+        run_csr_kernel(src, wei_values, wei_indices, wei_pointers, dst, M, N, K,
+                mm_dt, src_d.is_sparse_desc());
+
+    } else if (src_d.is_sparse_desc()) {
+        const auto weights = CTX_IN_MEM(const void *, DNNL_ARG_WEIGHTS);
+        const auto src_values = CTX_IN_MEM(const void *, DNNL_ARG_SRC, 0);
+        auto src_buffer_1 = CTX_IN_MEM(const int32_t *, DNNL_ARG_SRC, 1);
+        auto src_buffer_2 = CTX_IN_MEM(const int32_t *, DNNL_ARG_SRC, 2);
+
+        // Both COO and CSR encoded data are operated on by the CSR kernel
+        // for matrix multiplication.
+        // For COO encoding, data preparation includes using a temporary
+        // buffer to convert the data to the CSR format.
+        // Matrix multiplication is then carried out using the CSR encoded data.
+        const int32_t *src_indices = nullptr;
+        const int32_t *src_pointers = nullptr;
+
+        if (src_d.encoding() == sparse_encoding::csr) {
+            // For CSR encodings, pointer and indices assignment is
+            // straightforward:
+            // index 1 - index buffer, index 2 - pointer buffer.
+            src_indices = src_buffer_1;
+            src_pointers = src_buffer_2;
+        } else if (src_d.encoding() == sparse_encoding::coo) {
+            // For COO encodings, the two index buffers hold the row and column
+            // indices respectively. For CSR conversion, the row indices are
+            // compressed to generate the CSR pointers.
+            src_indices = src_buffer_2;
+
+            int32_t *src_row_pointers = scratchpad.template get<int32_t>(
+                    memory_tracking::names::key_matmul_sparse_tmp_ptr);
+
+            parallel_nd(M + 1, [&](dim_t m) {
+                io::store_float_value(
+                        src_d.metadata_type(0), 0, src_row_pointers, m);
+            });
+
+            cvt_coo_indices_to_csr_pointers(
+                    src_buffer_1, src_row_pointers, src_d.nnz(), M);
+            src_pointers = src_row_pointers;
+        }
+
+        run_csr_kernel(weights, src_values, src_indices, src_pointers, dst, M,
+                N, K, mm_dt, src_d.is_sparse_desc());
+    }
+    return status::success;
+}
+
+void ref_sparse_matmul_t::cvt_coo_indices_to_csr_pointers(
+        const int32_t *indices, int32_t *pointers, const int nnz,
+        const int nrows) const {
+    parallel_nd(
+            nnz, [&](dim_t i) { fetch_and_add(&pointers[indices[i] + 1], 1); });
+    for (int i = 0; i < nrows; ++i) {
+        pointers[i + 1] += pointers[i];
+    }
+}
+
+void ref_sparse_matmul_t::run_csr_kernel(const void *dmat, const void *values,
+        const int32_t *indices, const int32_t *pointers, void *res,
+        const dim_t M, const dim_t N, const dim_t K, const data_type_t mm_dt,
+        bool is_src_sparse) const {
+
+    if (is_src_sparse) {
+        // With a sparse source tensor, the matrix multiplication is carried out
+        // for a sparse multiplier with parallelization over the sparse rows
+        // of the multiplier matrix.
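(Editorial aside: cvt_coo_indices_to_csr_pointers above is a histogram over row indices followed by a prefix sum. A serial sketch of the same idea — the patch performs the counting pass in parallel with fetch_and_add:)

    #include <cstdint>
    #include <vector>

    std::vector<int32_t> coo_rows_to_csr_pointers(
            const std::vector<int32_t> &coo_rows, int nrows) {
        std::vector<int32_t> ptr(nrows + 1, 0);
        for (int32_t r : coo_rows)
            ++ptr[r + 1]; // histogram: entries per row
        for (int i = 0; i < nrows; ++i)
            ptr[i + 1] += ptr[i]; // prefix sum: row start offsets
        // e.g. coo_rows = {0, 0, 1, 3}, nrows = 4 -> ptr = {0, 2, 3, 3, 4}
        return ptr;
    }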
parallel_nd(M, [&](dim_t m) { - for (dim_t k = 0; k < K; k++) { - const dim_t row_start = wei_pointers[k]; - const dim_t row_end = wei_pointers[k + 1]; - for (dim_t n = row_start; n < row_end; n++) { - const dim_t src_idx = m * K + k; - const dim_t dst_idx = m * N + wei_indices[n]; - dst[dst_idx] = dst[dst_idx] + src[src_idx] * wei_values[n]; + const dim_t row_start = pointers[m]; + const dim_t row_end = pointers[m + 1]; + + for (dim_t n = 0; n < N; n++) { + const dim_t c_idx = m * N + n; + float c_val = io::load_float_value(mm_dt, res, c_idx); + + for (dim_t k = row_start; k < row_end; k++) { + const dim_t b_idx = indices[k] * N + n; + const float a_val = io::load_float_value(mm_dt, values, k); + const float b_val + = io::load_float_value(mm_dt, dmat, b_idx); + c_val += a_val * b_val; } + io::store_float_value(mm_dt, c_val, res, c_idx); } }); - } else if (src_d.is_sparse_desc()) { - const auto weights = CTX_IN_MEM(const float *, DNNL_ARG_WEIGHTS); - const auto src_values = CTX_IN_MEM(const float *, DNNL_ARG_SRC, 0); - const auto src_indices = CTX_IN_MEM(const int32_t *, DNNL_ARG_SRC, 1); - const auto src_pointers = CTX_IN_MEM(const int32_t *, DNNL_ARG_SRC, 2); - + } else { + // With a sparse weights tensor, the matrix multiplication is carried + // out for a sparse multiplicand with parallelization over the dense + // rows of the multiplier matrix. parallel_nd(M, [&](dim_t m) { - const dim_t row_start = src_pointers[m]; - const dim_t row_end = src_pointers[m + 1]; - for (dim_t k = row_start; k < row_end; k++) { - for (dim_t n = 0; n < N; n++) { - const dim_t dst_idx = m * N + n; - const dim_t wei_idx = src_indices[k] * N + n; - dst[dst_idx] - = dst[dst_idx] + src_values[k] * weights[wei_idx]; + for (dim_t k = 0; k < K; k++) { + const dim_t row_start = pointers[k]; + const dim_t row_end = pointers[k + 1]; + for (dim_t n = row_start; n < row_end; n++) { + const dim_t a_idx = m * K + k; + const dim_t c_idx = m * N + indices[n]; + const float a_val + = io::load_float_value(mm_dt, dmat, a_idx); + const float b_val = io::load_float_value(mm_dt, values, n); + float c_val = io::load_float_value(mm_dt, res, c_idx); + c_val += a_val * b_val; + io::store_float_value(mm_dt, c_val, res, c_idx); } } }); } - - return status::success; } } // namespace matmul diff --git a/src/cpu/matmul/ref_sparse_matmul.hpp b/src/cpu/matmul/ref_sparse_matmul.hpp index 2b7dbae8c08..16d63318deb 100644 --- a/src/cpu/matmul/ref_sparse_matmul.hpp +++ b/src/cpu/matmul/ref_sparse_matmul.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2023 Intel Corporation +* Copyright 2023-2024 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -44,25 +44,62 @@ struct ref_sparse_matmul_t : public primitive_t {
             memory_desc_wrapper src_d(src_md());
             memory_desc_wrapper wei_d(weights_md(0));
 
-            const bool ok
-                    = utils::everyone_is(f32, src_type, wei_type, dst_type)
-                    && utils::one_of(true, wei_d.is_sparse_desc(),
-                            src_d.is_sparse_desc())
-                    && IMPLICATION(wei_d.is_sparse_desc(),
-                            wei_d.encoding() == sparse_encoding::csr)
-                    && IMPLICATION(src_d.is_sparse_desc(),
-                            src_d.encoding() == sparse_encoding::csr)
-                    && IMPLICATION(
-                            wei_d.is_sparse_desc(), !src_d.is_sparse_desc())
-                    && IMPLICATION(src_d.is_sparse_desc(),
-                            utils::everyone_is(s32, src_d.metadata_type(0),
-                                    src_d.metadata_type(1)))
-                    && IMPLICATION(wei_d.is_sparse_desc(),
-                            utils::everyone_is(s32, wei_d.metadata_type(0),
-                                    wei_d.metadata_type(1)))
-                    && !with_bias() && attr()->has_default_values()
-                    && set_default_formats() && formats_ok(src_d, wei_d);
-            return ok ? status::success : status::unimplemented;
+            VDISPATCH_MATMUL(wei_d.is_sparse_desc() || src_d.is_sparse_desc(),
+                    VERBOSE_UNSUPPORTED_SPARSE_CFG);
+            VDISPATCH_MATMUL(wei_d.is_sparse_desc() ^ src_d.is_sparse_desc(),
+                    VERBOSE_UNSUPPORTED_SPARSE_CFG);
+
+            VDISPATCH_MATMUL(IMPLICATION(src_d.is_sparse_desc(),
+                                     utils::one_of(src_d.encoding(),
+                                             sparse_encoding::csr,
+                                             sparse_encoding::coo)),
+                    VERBOSE_UNSUPPORTED_SPARSE_CFG);
+            VDISPATCH_MATMUL(IMPLICATION(wei_d.is_sparse_desc(),
+                                     utils::one_of(wei_d.encoding(),
+                                             sparse_encoding::csr,
+                                             sparse_encoding::coo)),
+                    VERBOSE_UNSUPPORTED_SPARSE_CFG);
+
+            VDISPATCH_MATMUL(
+                    utils::everyone_is(f16, src_type, wei_type, dst_type)
+                            || utils::everyone_is(
+                                    f32, src_type, wei_type, dst_type),
+                    VERBOSE_UNSUPPORTED_DT_CFG);
+
+            if (src_d.is_sparse_desc()) {
+                sparse_mem_encoding = src_d.encoding();
+                VDISPATCH_MATMUL(
+                        IMPLICATION(sparse_mem_encoding == sparse_encoding::coo,
+                                s32 == src_d.metadata_type(0)),
+                        VERBOSE_UNSUPPORTED_SPARSE_CFG);
+                VDISPATCH_MATMUL(
+                        IMPLICATION(sparse_mem_encoding == sparse_encoding::csr,
+                                utils::everyone_is(s32, src_d.metadata_type(0),
+                                        src_d.metadata_type(1))),
+                        VERBOSE_UNSUPPORTED_SPARSE_CFG);
+            }
+            if (wei_d.is_sparse_desc()) {
+                sparse_mem_encoding = wei_d.encoding();
+                VDISPATCH_MATMUL(
+                        IMPLICATION(sparse_mem_encoding == sparse_encoding::coo,
+                                s32 == wei_d.metadata_type(0)),
+                        VERBOSE_UNSUPPORTED_SPARSE_CFG);
+
+                VDISPATCH_MATMUL(
+                        IMPLICATION(sparse_mem_encoding == sparse_encoding::csr,
+                                utils::everyone_is(s32, wei_d.metadata_type(0),
+                                        wei_d.metadata_type(1))),
+                        VERBOSE_UNSUPPORTED_SPARSE_CFG);
+            }
+
+            VDISPATCH_MATMUL(!with_bias(), VERBOSE_UNSUPPORTED_BIAS_CFG);
+            VDISPATCH_MATMUL(
+                    attr()->has_default_values(), VERBOSE_UNSUPPORTED_ATTR);
+            VDISPATCH_MATMUL(set_default_formats(), VERBOSE_UNSUPPORTED_ATTR);
+            VDISPATCH_MATMUL(formats_ok(src_d, wei_d), VERBOSE_UNSUPPORTED_TAG);
+
+            init_scratchpad();
+            return status::success;
         }
 
         bool formats_ok(const memory_desc_wrapper &src_d,
@@ -76,10 +113,41 @@ struct ref_sparse_matmul_t : public primitive_t {
                 return src_d.matches_one_of_tag(format_tag::ab);
             return false;
         }
+
+    private:
+        void init_scratchpad() {
+            using namespace memory_tracking::names;
+            const memory_desc_wrapper src_d(src_md());
+            const memory_desc_wrapper wei_d(weights_md());
+
+            if (sparse_mem_encoding == sparse_encoding::coo) {
+                auto scratchpad = scratchpad_registry().registrar();
+                const bool is_wei_sparse = wei_d.is_sparse_desc();
+                const auto ptr_size
+                        = src_d.dims()[static_cast<int>(is_wei_sparse)] + 1;
+                scratchpad.template book<int32_t>(
+                        key_matmul_sparse_tmp_ptr, ptr_size);
+            }
+        }
+
+        sparse_encoding_t sparse_mem_encoding = sparse_encoding::undef;
     };
 
     ref_sparse_matmul_t(const pd_t *apd) : primitive_t(apd) {}
 
+    // COO sparse encodings are converted to the CSR format by
+    // compressing the respective row indices into CSR pointers.
+    void cvt_coo_indices_to_csr_pointers(const int32_t *indices,
+            int32_t *pointers, const int nnz, const int nrows) const;
+
+    // Executes the matrix multiplication C = A x B, where one of the input
+    // matrices is dense. Operand indices are determined depending on
+    // whether the multiplier or the multiplicand is dense.
+    void run_csr_kernel(const void *dmat, const void *values,
+            const int32_t *indices, const int32_t *pointers, void *res,
+            const dim_t M, const dim_t N, const dim_t K,
+            const data_type_t mm_dt, bool is_src_sparse) const;
+
     status_t execute(const exec_ctx_t &ctx) const override;
 
 private:
diff --git a/src/cpu/nchw_pooling.cpp b/src/cpu/nchw_pooling.cpp
index 6b709e4cd3f..6454e68d22b 100644
--- a/src/cpu/nchw_pooling.cpp
+++ b/src/cpu/nchw_pooling.cpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2017-2024 Intel Corporation
+* Copyright 2017-2025 Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -36,13 +36,18 @@ template <>
 status_t nchw_pooling_fwd_t<data_type::f32>::execute_forward(
         const exec_ctx_t &ctx) const {
     const auto alg = pd()->desc()->alg_kind;
-    const auto src = CTX_IN_MEM(const data_t *, DNNL_ARG_SRC);
+    auto src = CTX_IN_MEM(const data_t *, DNNL_ARG_SRC);
     auto dst = CTX_OUT_MEM(data_t *, DNNL_ARG_DST);
     auto ws = CTX_OUT_MEM(unsigned char *, DNNL_ARG_WORKSPACE);
 
     const memory_desc_wrapper ws_d(pd()->workspace_md());
+    const memory_desc_wrapper src_d(pd()->src_md());
+    const memory_desc_wrapper dst_d(pd()->dst_md());
     const data_type_t ws_dt = ws ? ws_d.data_type() : data_type::undef;
 
+    src += src_d.off_l(0);
+    dst += dst_d.off_l(0);
+
     const dim_t MB = pd()->MB();
     const dim_t C = pd()->OC();
     const dim_t OD = pd()->OD();
@@ -61,7 +66,7 @@ status_t nchw_pooling_fwd_t<data_type::f32>::execute_forward(
     const dim_t padT = pd()->padT();
     const dim_t padL = pd()->padL();
 
-    const auto apply_offset = [](int index, int offset) {
+    const auto apply_offset = [](dim_t index, dim_t offset) {
         return (index > offset) ?
index - offset : 0;
    };
 
@@ -74,7 +79,7 @@ status_t nchw_pooling_fwd_t<data_type::f32>::execute_forward(
                     + (size_t)OW * oh + (size_t)ow;
             if (ws_dt == data_type::u8) {
                 assert(0 <= value
-                        && value <= numeric_limits<typename prec_traits<
+                        && value <= numeric_limits<typename prec_traits_t<
                                 data_type::u8>::type>::max());
                 ws[ws_offset] = value;
             } else
@@ -87,6 +92,10 @@ status_t nchw_pooling_fwd_t<data_type::f32>::execute_forward(
         const auto src_off = IW * IH * ID * C * mb + IW * IH * ID * c;
         const auto *src_loc = &src[src_off];
 
+        data_t d_val = d[0];
+        dim_t kd_max = 0;
+        dim_t kh_max = 0;
+        dim_t kw_max = 0;
         for_(dim_t kd = 0; kd < KD; ++kd)
         for_(dim_t kh = 0; kh < KH; ++kh)
         for (dim_t kw = 0; kw < KW; ++kw) {
@@ -99,11 +108,18 @@ status_t nchw_pooling_fwd_t<data_type::f32>::execute_forward(
                 const auto src_off_loc = IW * IH * id + IW * ih + iw;
                 const auto &s = src_loc[src_off_loc];
 
-                if (s > d[0]) {
-                    d[0] = s;
-                    set_ws(mb, c, od, oh, ow, kd * KH * KW + kh * KW + kw);
+                if (s > d_val) {
+                    d_val = s;
+                    kd_max = kd;
+                    kh_max = kh;
+                    kw_max = kw;
                 }
             }
+
+        if (d_val > d[0]) {
+            d[0] = d_val;
+            set_ws(mb, c, od, oh, ow, kd_max * KH * KW + kh_max * KW + kw_max);
+        }
     };
 
     const auto ker_avg = [=](data_t *d, dim_t mb, dim_t c, dim_t od, dim_t oh,
@@ -254,7 +270,7 @@
     const size_t blocked_size = src_size / simd_w;
     const size_t tail_size = src_size % simd_w;
 
-    auto apply_offset = [=](int index, int offset) {
+    auto apply_offset = [=](dim_t index, dim_t offset) {
         return (index > offset) ? index - offset : 0;
     };
 
@@ -267,7 +283,7 @@
                     + (size_t)OW * oh + (size_t)ow;
             if (ws_dt == data_type::u8) {
                 assert(0 <= value
-                        && value <= numeric_limits<typename prec_traits<
+                        && value <= numeric_limits<typename prec_traits_t<
                                 data_type::u8>::type>::max());
                 ws[ws_offset] = value;
             } else
@@ -280,6 +296,10 @@
         const auto src_off = IW * IH * ID * C * mb + IW * IH * ID * c;
         const auto *src_loc = &cvt_wsp[src_off];
 
+        float d_val = d[0];
+        dim_t kd_max = 0;
+        dim_t kh_max = 0;
+        dim_t kw_max = 0;
         for_(dim_t kd = 0; kd < KD; ++kd)
         for_(dim_t kh = 0; kh < KH; ++kh)
         for (dim_t kw = 0; kw < KW; ++kw) {
@@ -292,11 +312,18 @@
                 const auto src_off_loc = IW * IH * id + IW * ih + iw;
                 const auto &s = src_loc[src_off_loc];
 
-                if (s > d[0]) {
-                    d[0] = s;
-                    set_ws(mb, c, od, oh, ow, kd * KH * KW + kh * KW + kw);
+                if (s > d_val) {
+                    d_val = s;
+                    kd_max = kd;
+                    kh_max = kh;
+                    kw_max = kw;
                 }
             }
+
+        if (d_val > d[0]) {
+            d[0] = d_val;
+            set_ws(mb, c, od, oh, ow, kd_max * KH * KW + kh_max * KW + kw_max);
+        }
     };
 
     auto ker_avg = [=](float *d, dim_t mb, dim_t c, dim_t od, dim_t oh,
@@ -442,7 +469,7 @@ status_t nchw_pooling_bwd_t<data_type::f32>::execute_backward(
     const dim_t padT = pd()->padT();
     const dim_t padL = pd()->padL();
 
-    auto apply_offset = [=](int index, int offset) {
+    auto apply_offset = [=](dim_t index, dim_t offset) {
        return (index > offset) ?
index - offset : 0; }; @@ -486,7 +513,7 @@ status_t nchw_pooling_bwd_t::execute_backward( diff_src[diff_src_offset] += d[0]; }; - auto ker_avg = [=](const data_t *d, dim_t mb, dim_t c, dim_t od, dim_t oh, + auto ker_avg = [=](data_t d, dim_t mb, dim_t c, dim_t od, dim_t oh, dim_t ow) { dim_t id_start = apply_offset(od * SD, padF); dim_t ih_start = apply_offset(oh * SH, padT); @@ -506,7 +533,7 @@ status_t nchw_pooling_bwd_t::execute_backward( size_t diff_src_offset = (size_t)mb * C * ID * IH * IW + (size_t)c * ID * IH * IW + (size_t)id * IH * IW + (size_t)ih * IW + (size_t)iw; - diff_src[diff_src_offset] += d[0] / num_summands; + diff_src[diff_src_offset] += d / num_summands; } }; @@ -544,7 +571,7 @@ status_t nchw_pooling_bwd_t::execute_backward( size_t diff_dst_offset = diff_dst_offset_b + (size_t)od * OH * OW + (size_t)oh * OW; for (dim_t ow = ow_start; ow < ow_end; ++ow) { - const data_t *d = &diff_dst[diff_dst_offset + ow]; + data_t d = diff_dst[diff_dst_offset + ow]; ker_avg(d, mb, c, od, oh, ow); } } @@ -595,7 +622,7 @@ status_t nchw_pooling_bwd_t::execute_backward( const size_t dst_sp_size = pd()->OD() * pd()->OH() * pd()->OW(); const size_t src_sp_size = pd()->ID() * pd()->IH() * pd()->IW(); - auto apply_offset = [=](int index, int offset) { + auto apply_offset = [=](dim_t index, dim_t offset) { return (index > offset) ? index - offset : 0; }; @@ -638,8 +665,8 @@ status_t nchw_pooling_bwd_t::execute_backward( diff_src[diff_src_offset] += d[0]; }; - auto ker_avg = [=](const float *d, float *diff_src, dim_t mb, dim_t c, - dim_t od, dim_t oh, dim_t ow) { + auto ker_avg = [=](float d, float *diff_src, dim_t mb, dim_t c, dim_t od, + dim_t oh, dim_t ow) { auto id_start = apply_offset(od * SD, padF); auto ih_start = apply_offset(oh * SH, padT); auto iw_start = apply_offset(ow * SW, padL); @@ -657,7 +684,7 @@ status_t nchw_pooling_bwd_t::execute_backward( for (dim_t iw = iw_start; iw < iw_end; ++iw) { size_t diff_src_offset = (size_t)id * IH * IW + (size_t)ih * IW + (size_t)iw; - diff_src[diff_src_offset] += d[0] / num_summands; + diff_src[diff_src_offset] += d / num_summands; } }; @@ -677,6 +704,7 @@ status_t nchw_pooling_bwd_t::execute_backward( if (alg == alg_kind::pooling_max) { parallel_nd_ext(nthr, MB, utils::div_up(C, c_blk), [&](int ithr, int, dim_t mb, dim_t cb) { + assert(ithr < pd()->nbuf_); bool is_last_c_block = c_blk_tail > 0 && (cb + 1) * c_blk > C; dim_t curr_c_block = is_last_c_block ? c_blk_tail : c_blk; @@ -713,6 +741,7 @@ status_t nchw_pooling_bwd_t::execute_backward( } else { parallel_nd_ext(nthr, MB, utils::div_up(C, c_blk), [&](int ithr, int, dim_t mb, dim_t cb) { + assert(ithr < pd()->nbuf_); bool is_last_c_block = c_blk_tail > 0 && (cb + 1) * c_blk > C; dim_t curr_c_block = is_last_c_block ? 
c_blk_tail : c_blk;
@@ -734,8 +763,7 @@ status_t nchw_pooling_bwd_t<d_type>::execute_backward(
                         size_t diff_dst_offset = (size_t)c * OD * OH * OW
                                 + (size_t)od * OH * OW + (size_t)oh * OW;
                         for (dim_t ow = ow_start; ow < ow_end; ++ow) {
-                            const float *d
-                                    = &diff_dst_fp32[diff_dst_offset + ow];
+                            float d = diff_dst_fp32[diff_dst_offset + ow];
                             ker_avg(d, &diff_src_fp32[c * ID * IH * IW], mb,
                                     cb * c_blk + c, od, oh, ow);
                         }
diff --git a/src/cpu/nchw_pooling.hpp b/src/cpu/nchw_pooling.hpp
index ae3b2fc5367..7ea95f2a66e 100644
--- a/src/cpu/nchw_pooling.hpp
+++ b/src/cpu/nchw_pooling.hpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2017-2024 Intel Corporation
+* Copyright 2017-2025 Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -51,6 +51,9 @@ struct nchw_pooling_fwd_t : public primitive_t {
                             alg_kind::pooling_avg_include_padding,
                             alg_kind::pooling_avg_exclude_padding),
                     VERBOSE_BAD_ALGORITHM);
+            VDISPATCH_POOLING(
+                    memory_desc_wrapper(dst_md()).is_dense(false),
+                    VERBOSE_UNSUPPORTED_SPARSE_CFG);
             VDISPATCH_POOLING(utils::everyone_is(d_type, src_md()->data_type,
                                       dst_md()->data_type),
                     VERBOSE_UNSUPPORTED_DT);
@@ -101,7 +104,7 @@ struct nchw_pooling_fwd_t : public primitive_t {
 
     nchw_pooling_fwd_t(const pd_t *apd) : primitive_t(apd) {}
 
-    using data_t = typename prec_traits<d_type>::type;
+    using data_t = typename prec_traits_t<d_type>::type;
 
     status_t init(engine_t *engine) override {
         ref_post_ops_
@@ -174,8 +177,9 @@ struct nchw_pooling_bwd_t : public primitive_t {
             return status::success;
         }
 
-        dim_t channel_block_size_;
+        dim_t channel_block_size_ {1};
        int nthr_; // To not exceed the limit in execute used for set up.
+        int nbuf_ {0};
 
    private:
        void init_scratchpad() {
@@ -185,31 +189,39 @@
            size_t src_sz_ = ID() * IH() * IW();
            auto scratchpad = scratchpad_registry().registrar();
+            // The value of nbuf_ must be consistent with the arguments of
+            // parallel_nd_ext called from execute_backward for
+            // data_type != f32.
+            nbuf_ = nstl::min(static_cast<dim_t>(nthr_),
+                    MB() * utils::div_up(IC(), channel_block_size_));
+
            scratchpad.template book<float>(key_pool_src_bf16cvt,
-                    src_sz_ * nthr_ * channel_block_size_);
+                    src_sz_ * nbuf_ * channel_block_size_);
            scratchpad.template book<float>(key_pool_dst_bf16cvt,
-                    dst_sz_ * nthr_ * channel_block_size_);
+                    dst_sz_ * nbuf_ * channel_block_size_);
        }
    }
 
    void calculate_channel_block_size() {
-        // calculate channels block size at which the data fits into half
-        // of L1, it allows to improve performance for problems with small
-        // spatial
-        dim_t dst_sz_ = OD() * OH() * OW();
-        dim_t src_sz_ = ID() * IH() * IW();
-        dim_t C_per_thr = nstl::min(MB() * IC() / nthr_, IC());
-        const dim_t max_block_size
-                = platform::get_per_core_cache_size(1) / 2;
-        dim_t data_size_per_ch = (dst_sz_ + src_sz_) * 6; // f32 + bf16
-        channel_block_size_ = nstl::max(
-                nstl::min(C_per_thr, max_block_size / data_size_per_ch),
-                (dim_t)1);
+        using namespace memory_tracking::names;
+        if (diff_dst_md()->data_type != data_type::f32) {
+            // Calculate the channel block size at which the data fits into
+            // half of L1; this improves performance for problems with small
+            // spatial sizes.
+            dim_t dst_sz_ = OD() * OH() * OW();
+            dim_t src_sz_ = ID() * IH() * IW();
+            dim_t C_per_thr = nstl::min(MB() * IC() / nthr_, IC());
+            const dim_t max_block_size
+                    = platform::get_per_core_cache_size(1) / 2;
+            dim_t data_size_per_ch = (dst_sz_ + src_sz_) * 6; // f32 + bf16
+            channel_block_size_ = nstl::max(
+                    nstl::min(C_per_thr, max_block_size / data_size_per_ch),
+                    (dim_t)1);
+        }
    }
 };
 
    nchw_pooling_bwd_t(const pd_t *apd) : primitive_t(apd) {}
 
-    typedef typename prec_traits<d_type>::type data_t;
+    using data_t = typename prec_traits_t<d_type>::type;
 
    status_t execute(const exec_ctx_t &ctx) const override {
        return execute_backward(ctx);
diff --git a/src/cpu/ncsp_batch_normalization.hpp b/src/cpu/ncsp_batch_normalization.hpp
index 2cfe4d834b4..0cde9f513b5 100644
--- a/src/cpu/ncsp_batch_normalization.hpp
+++ b/src/cpu/ncsp_batch_normalization.hpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2018-2024 Intel Corporation
+* Copyright 2018-2025 Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
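(Editorial aside: the nbuf_ sizing above caps the number of f32 conversion buffers at the number of (mb, cb) tasks that can actually run. A worked example under hypothetical shapes:)

    #include <algorithm>
    #include <cstdint>

    int64_t div_up(int64_t a, int64_t b) { return (a + b - 1) / b; }

    // Mirrors nbuf_ = min(nthr, MB * div_up(IC, channel_block_size)).
    int64_t nbuf_for(int64_t nthr, int64_t mb, int64_t ic, int64_t c_blk) {
        return std::min(nthr, mb * div_up(ic, c_blk));
    }
    // Hypothetical: nbuf_for(32, 2, 24, 16) == 4, so the scratchpad books
    // 4 src/dst conversion buffers instead of 32 -- an 8x reduction.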
@@ -112,11 +112,11 @@ struct ncsp_batch_normalization_fwd_t : public primitive_t { } }; - typedef typename prec_traits::type data_t; - typedef float acc_data_t; + using data_t = typename prec_traits_t::type; + using acc_data_t = float; ncsp_batch_normalization_fwd_t(const pd_t *apd) : primitive_t(apd) {} - ~ncsp_batch_normalization_fwd_t() {} + ~ncsp_batch_normalization_fwd_t() override = default; status_t execute(const exec_ctx_t &ctx) const override { return execute_forward(ctx); @@ -209,11 +209,11 @@ struct ncsp_batch_normalization_bwd_t : public primitive_t { } }; - typedef typename prec_traits::type data_t; - typedef float acc_data_t; + using data_t = typename prec_traits_t::type; + using acc_data_t = float; ncsp_batch_normalization_bwd_t(const pd_t *apd) : primitive_t(apd) {} - ~ncsp_batch_normalization_bwd_t() {} + ~ncsp_batch_normalization_bwd_t() override = default; status_t execute(const exec_ctx_t &ctx) const override { return execute_backward(ctx); diff --git a/src/cpu/ncsp_group_normalization.hpp b/src/cpu/ncsp_group_normalization.hpp index 85c5f68bb6d..5c8237a0bf3 100644 --- a/src/cpu/ncsp_group_normalization.hpp +++ b/src/cpu/ncsp_group_normalization.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2023-2024 Intel Corporation +* Copyright 2023-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -68,8 +68,7 @@ struct ncsp_group_normalization_fwd_t : public primitive_t { VDISPATCH_GNORM(memory_desc_matches_one_of_tag( *dst_md(), ncdhw, nchw, ncw, nc), VERBOSE_UNSUPPORTED_TAG_S, "dst"); - VDISPATCH_GNORM( - attr()->has_default_values(skip_mask_t::scales_runtime) + VDISPATCH_GNORM(attr()->has_default_values(skip_mask_t::scales) && attr_scales_ok(), VERBOSE_UNSUPPORTED_ATTR); nthr_ = dnnl_get_max_threads(); diff --git a/src/cpu/nhwc_pooling.cpp b/src/cpu/nhwc_pooling.cpp index b20ee0a92a0..754b5ab4c40 100644 --- a/src/cpu/nhwc_pooling.cpp +++ b/src/cpu/nhwc_pooling.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2024 Intel Corporation +* Copyright 2019-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -55,17 +55,17 @@ namespace cpu { = MEM_D(name).blocking_desc().strides[ndims - 1]; namespace nhwc_pooling { -size_t strided_offset(const int _n, const size_t _sn, const int _d, - const size_t _sd, const int _h, const size_t _sh, const int _w, +size_t strided_offset(const dim_t _n, const size_t _sn, const dim_t _d, + const size_t _sd, const dim_t _h, const size_t _sh, const dim_t _w, const size_t _sw) { return _n * _sn + _d * _sd + _h * _sh + _w * _sw; } } // namespace nhwc_pooling template -void nhwc_pooling_fwd_t::array_div_by_const(const int n, +void nhwc_pooling_fwd_t::array_div_by_const(const dim_t n, const ker_data_t *src, const size_t num, ker_data_t *dst) const { - for (int i = 0; i < n; ++i) { + for (dim_t i = 0; i < n; ++i) { const float ftmp = ((float)src[i]) / num; dst[i] = q10n::out_round(ftmp); } @@ -73,21 +73,21 @@ void nhwc_pooling_fwd_t::array_div_by_const(const int n, template void nhwc_pooling_fwd_t::array_add( - const int n, const ker_data_t *src, ker_data_t *dst) const { - for (int i = 0; i < n; ++i) { + const dim_t n, const ker_data_t *src, ker_data_t *dst) const { + for (dim_t i = 0; i < n; ++i) { dst[i] += src[i]; } } template -void nhwc_pooling_fwd_t::array_nhwc_max(const int n, ker_data_t *dst, +void nhwc_pooling_fwd_t::array_nhwc_max(const dim_t n, ker_data_t *dst, const ker_data_t *src, unsigned char *ws, const size_t ws_offset, const data_type_t ws_dt, const int index) const { assert(ws); #if SAFE_TO_USE_OMP_SIMD PRAGMA_OMP_SIMD() #endif - for (int oc = 0; oc < n; ++oc) { + for (dim_t oc = 0; oc < n; ++oc) { const auto s = src[oc]; ker_data_t mv = dst[oc]; @@ -130,14 +130,14 @@ void nhwc_pooling_fwd_t::array_nhwc_max(const int n, ker_data_t *dst, } template -void nhwc_pooling_fwd_t::array_nhwc_initialize(const int n, +void nhwc_pooling_fwd_t::array_nhwc_initialize(const dim_t n, ker_data_t *dst, unsigned char *ws, const size_t ws_offset, const data_type_t ws_dt) const { assert(ws && (ws_dt == data_type::u8 || ws_dt == data_type::s32)); #if SAFE_TO_USE_OMP_SIMD PRAGMA_OMP_SIMD() #endif - for (int oc = 0; oc < n; ++oc) { + for (dim_t oc = 0; oc < n; ++oc) { if (ws_dt == data_type::u8) ws[ws_offset + oc] = 0; else @@ -189,7 +189,7 @@ status_t nhwc_pooling_fwd_t::execute_forward( DECLARE_READ_STRIDES(src); DECLARE_READ_STRIDES(dst); - const auto apply_offset = [](int index, int offset) { + const auto apply_offset = [](dim_t index, dim_t offset) { return (index > offset) ? index - offset : 0; }; diff --git a/src/cpu/nhwc_pooling.hpp b/src/cpu/nhwc_pooling.hpp index 44c71049b9e..98fb378ddcd 100644 --- a/src/cpu/nhwc_pooling.hpp +++ b/src/cpu/nhwc_pooling.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2024 Intel Corporation +* Copyright 2019-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
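(Editorial aside: the int -> dim_t widening in strided_offset and the array helpers above matters for large tensors: a flattened offset is a product of several dimensions and can exceed INT_MAX long before any single dimension does. A minimal sketch with a hypothetical shape:)

    #include <cstdint>

    using dim_t = int64_t; // oneDNN's dim_t is a 64-bit signed type

    // Hypothetical batch stride of a 512x512x512 spatial, C = 32 tensor:
    // 512 * 512 * 512 * 32 == 2^32, already past INT32_MAX.
    const dim_t SN = dim_t(512) * 512 * 512 * 32;

    dim_t offset64(dim_t n) { return n * SN; } // correct for any batch index
    int offset32(int n) { return n * int(SN); } // 32-bit math corrupts the offset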
@@ -35,8 +35,8 @@ namespace impl { namespace cpu { namespace nhwc_pooling { -size_t strided_offset(const int _n, const size_t _sn, const int _d, - const size_t _sd, const int _h, const size_t _sh, const int _w, +size_t strided_offset(const dim_t _n, const size_t _sn, const dim_t _d, + const size_t _sd, const dim_t _h, const size_t _sh, const dim_t _w, const size_t _sw); } @@ -113,8 +113,8 @@ struct nhwc_pooling_fwd_t : public primitive_t { nhwc_pooling_fwd_t(const pd_t *apd) : primitive_t(apd) {} - using data_t = typename prec_traits::type; - using ker_data_t = typename prec_traits::type; + using data_t = typename prec_traits_t::type; + using ker_data_t = typename prec_traits_t::type; status_t init(engine_t *engine) override { ref_post_ops_ @@ -130,14 +130,15 @@ struct nhwc_pooling_fwd_t : public primitive_t { private: status_t execute_forward(const exec_ctx_t &ctx) const; - void array_div_by_const(const int n, const ker_data_t *src, + void array_div_by_const(const dim_t n, const ker_data_t *src, const size_t num, ker_data_t *dst) const; - void array_add(const int n, const ker_data_t *src, ker_data_t *dst) const; - void array_nhwc_max(const int n, ker_data_t *dst, const ker_data_t *src, + void array_add(const dim_t n, const ker_data_t *src, ker_data_t *dst) const; + void array_nhwc_max(const dim_t n, ker_data_t *dst, const ker_data_t *src, unsigned char *ws, const size_t ws_offset, const data_type_t ws_dt, const int index) const; - void array_nhwc_initialize(const int n, ker_data_t *dst, unsigned char *ws, - const size_t ws_offset, const data_type_t ws_dt) const; + void array_nhwc_initialize(const dim_t n, ker_data_t *dst, + unsigned char *ws, const size_t ws_offset, + const data_type_t ws_dt) const; const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); } std::unique_ptr ref_post_ops_; @@ -210,7 +211,7 @@ struct nhwc_pooling_bwd_t : public primitive_t { }; nhwc_pooling_bwd_t(const pd_t *apd) : primitive_t(apd) {} - typedef typename prec_traits::type data_t; + using data_t = typename prec_traits_t::type; status_t execute(const exec_ctx_t &ctx) const override { return execute_backward(ctx); diff --git a/src/cpu/nspc_batch_normalization.hpp b/src/cpu/nspc_batch_normalization.hpp index 90a8a2e0029..456f5c6b9f0 100644 --- a/src/cpu/nspc_batch_normalization.hpp +++ b/src/cpu/nspc_batch_normalization.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2018-2024 Intel Corporation +* Copyright 2018-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -36,10 +36,8 @@ namespace cpu { template struct nspc_batch_normalization_fwd_t : public primitive_t { struct pd_t : public cpu_batch_normalization_fwd_pd_t { - pd_t(const batch_normalization_desc_t *adesc, - const primitive_attr_t *attr, - const batch_normalization_fwd_pd_t *hint_fwd_pd) - : cpu_batch_normalization_fwd_pd_t(adesc, attr, hint_fwd_pd) {} + using cpu_batch_normalization_fwd_pd_t:: + cpu_batch_normalization_fwd_pd_t; DECLARE_COMMON_PD_T("nspc_bnorm:any", nspc_batch_normalization_fwd_t); @@ -111,11 +109,11 @@ struct nspc_batch_normalization_fwd_t : public primitive_t { } }; - typedef typename prec_traits::type data_t; - typedef float acc_data_t; + using data_t = typename prec_traits_t::type; + using acc_data_t = float; nspc_batch_normalization_fwd_t(const pd_t *apd) : primitive_t(apd) {} - ~nspc_batch_normalization_fwd_t() {} + ~nspc_batch_normalization_fwd_t() override = default; status_t execute(const exec_ctx_t &ctx) const override { return execute_forward(ctx); @@ -129,10 +127,8 @@ struct nspc_batch_normalization_fwd_t : public primitive_t { template struct nspc_batch_normalization_bwd_t : public primitive_t { struct pd_t : public cpu_batch_normalization_bwd_pd_t { - pd_t(const batch_normalization_desc_t *adesc, - const primitive_attr_t *attr, - const batch_normalization_fwd_pd_t *hint_fwd_pd) - : cpu_batch_normalization_bwd_pd_t(adesc, attr, hint_fwd_pd) {} + using cpu_batch_normalization_bwd_pd_t:: + cpu_batch_normalization_bwd_pd_t; DECLARE_COMMON_PD_T("nspc_bnorm:any", nspc_batch_normalization_bwd_t); @@ -204,11 +200,11 @@ struct nspc_batch_normalization_bwd_t : public primitive_t { } }; - typedef typename prec_traits::type data_t; - typedef float acc_data_t; + using data_t = typename prec_traits_t::type; + using acc_data_t = float; nspc_batch_normalization_bwd_t(const pd_t *apd) : primitive_t(apd) {} - ~nspc_batch_normalization_bwd_t() {} + ~nspc_batch_normalization_bwd_t() override = default; status_t execute(const exec_ctx_t &ctx) const override { return execute_backward(ctx); diff --git a/src/cpu/platform.cpp b/src/cpu/platform.cpp index f372543ccb7..d4f5a217c0f 100644 --- a/src/cpu/platform.cpp +++ b/src/cpu/platform.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2020-2024 Intel Corporation +* Copyright 2020-2025 Intel Corporation * Copyright 2020-2024 FUJITSU LIMITED * Copyright 2022-2024 Arm Ltd. and affiliates * @@ -34,10 +34,12 @@ #include "cpu/x64/cpu_isa_traits.hpp" #elif DNNL_AARCH64 #include "cpu/aarch64/cpu_isa_traits.hpp" -#if DNNL_AARCH64_USE_ACL +#endif +#if DNNL_USE_ACL // For checking if fp16 isa is supported on the platform #include "arm_compute/core/CPP/CPPTypes.h" -#endif +// For setting the number of threads for ACL +#include "src/common/cpuinfo/CpuInfo.h" #endif // For DNNL_X64 build we compute the timestamp using rdtsc. 
Use std::chrono for @@ -82,6 +84,8 @@ status_t set_max_cpu_isa(dnnl_cpu_isa_t isa) { status_t set_cpu_isa_hints(dnnl_cpu_isa_hints_t isa_hints) { #if DNNL_X64 return x64::set_cpu_isa_hints(isa_hints); +#elif DNNL_AARCH64 + return status::success; #else return status::unimplemented; #endif @@ -124,7 +128,7 @@ bool has_data_type_support(data_type_t data_type) { #if DNNL_X64 return x64::mayiuse(x64::avx512_core_fp16) || x64::mayiuse(x64::avx2_vnni_2); -#elif DNNL_AARCH64_USE_ACL +#elif DNNL_USE_ACL return arm_compute::CPUInfo::get().has_fp16(); #else return false; @@ -151,7 +155,7 @@ bool has_training_support(data_type_t data_type) { #if defined(USE_CBLAS) && defined(BLAS_HAS_SBGEMM) && defined(__MMA__) return true; #endif -#elif DNNL_AARCH64_USE_ACL +#elif DNNL_USE_ACL return arm_compute::CPUInfo::get().has_bf16(); #else return false; @@ -159,7 +163,7 @@ bool has_training_support(data_type_t data_type) { case data_type::f16: #if DNNL_X64 return x64::mayiuse(x64::avx512_core_fp16); -#elif DNNL_AARCH64_USE_ACL +#elif DNNL_USE_ACL return arm_compute::CPUInfo::get().has_fp16(); #else return false; @@ -205,8 +209,8 @@ unsigned get_per_core_cache_size(int level) { unsigned get_num_cores() { #if DNNL_X64 return x64::cpu().getNumCores(Xbyak::util::CoreLevel); -#elif DNNL_AARCH64_USE_ACL - return aarch64::cpu().getNumCores(Xbyak_aarch64::util::CoreLevel); +#elif DNNL_USE_ACL + return arm_compute::cpuinfo::num_threads_hint(); #else return 1; #endif @@ -256,9 +260,9 @@ unsigned get_max_threads_to_use() { int get_vector_register_size() { #if DNNL_X64 using namespace x64; - if (mayiuse(avx512_core)) return cpu_isa_traits::vlen; - if (mayiuse(avx)) return cpu_isa_traits::vlen; - if (mayiuse(sse41)) return cpu_isa_traits::vlen; + if (mayiuse(avx512_core)) return cpu_isa_traits_t::vlen; + if (mayiuse(avx)) return cpu_isa_traits_t::vlen; + if (mayiuse(sse41)) return cpu_isa_traits_t::vlen; #elif DNNL_AARCH64 using namespace aarch64; if (mayiuse(asimd)) return cpu_isa_traits::vlen; diff --git a/src/cpu/platform.hpp b/src/cpu/platform.hpp index 1de81f578e6..af0d6e944a8 100644 --- a/src/cpu/platform.hpp +++ b/src/cpu/platform.hpp @@ -26,7 +26,9 @@ // Possible architectures: // - DNNL_X64 +// - DNNL_X86 // - DNNL_AARCH64 +// - DNNL_ARM // - DNNL_PPC64 // - DNNL_S390X // - DNNL_RV64 @@ -35,12 +37,19 @@ #if defined(DNNL_X64) + defined(DNNL_AARCH64) + defined(DNNL_PPC64) \ + defined(DNNL_S390X) + defined(DNNL_RV64) \ + + defined(DNNL_ARM) + defined(DNNL_X86) \ + defined(DNNL_ARCH_GENERIC) \ == 0 -#if defined(__x86_64__) || defined(_M_X64) +#if defined(__amd64__) || defined(__amd64) || defined(__x86_64__) || \ + defined(__x86_64) || defined(_M_X64) || defined(_M_AMD64) #define DNNL_X64 1 -#elif defined(__aarch64__) +#elif defined(i386) || defined(__i386) || defined(__i386__) || defined(__IA32__) || defined(_M_I86) || \ + defined(_M_IX86) || defined(__X86__) || defined(_X86_) || defined(__I86__) || defined(__386) +#define DNNL_X86 1 +#elif defined(__aarch64__) || defined(_M_ARM64) #define DNNL_AARCH64 1 +#elif defined(__arm__) || defined(_M_ARM) || defined(__ARMEL__) +#define DNNL_ARM 1 #elif defined(__powerpc64__) || defined(__PPC64__) || defined(_ARCH_PPC64) #define DNNL_PPC64 1 #elif defined(__s390x__) @@ -54,6 +63,7 @@ #if defined(DNNL_X64) + defined(DNNL_AARCH64) + defined(DNNL_PPC64) \ + defined(DNNL_S390X) + defined(DNNL_RV64) \ + + defined(DNNL_ARM) + defined(DNNL_X86) \ + defined(DNNL_ARCH_GENERIC) \ != 1 #error One and only one architecture should be defined at a time @@ -62,9 +72,15 @@ #if 
!defined(DNNL_X64) #define DNNL_X64 0 #endif +#if !defined(DNNL_X86) +#define DNNL_X86 0 +#endif #if !defined(DNNL_AARCH64) #define DNNL_AARCH64 0 #endif +#if !defined(DNNL_ARM) +#define DNNL_ARM 0 +#endif #if !defined(DNNL_PPC64) #define DNNL_PPC64 0 #endif @@ -84,6 +100,7 @@ #define DNNL_PPC64_ONLY(...) Z_CONDITIONAL_DO(DNNL_PPC64_ONLY, __VA_ARGS__) #define DNNL_S390X_ONLY(...) Z_CONDITIONAL_DO(DNNL_S390X_ONLY, __VA_ARGS__) #define DNNL_AARCH64_ONLY(...) Z_CONDITIONAL_DO(DNNL_AARCH64, __VA_ARGS__) +#define DNNL_ARM_ONLY(...) Z_CONDITIONAL_DO(DNNL_ARM, __VA_ARGS__) // Using RISC-V implementations optimized with RVV Intrinsics is optional for RISC-V builds // and can be enabled with DNNL_ARCH_OPT_FLAGS="-march=" option, where @@ -98,11 +115,11 @@ #define DNNL_NON_X64_ONLY(...) Z_CONDITIONAL_DO(Z_NOT(DNNL_X64), __VA_ARGS__) // Using Arm Compute Library kernels is optional for AArch64 builds -// and can be enabled with the DNNL_AARCH64_USE_ACL CMake option -#if defined(DNNL_AARCH64) && defined(DNNL_AARCH64_USE_ACL) -#define DNNL_AARCH64_ACL_ONLY(...) __VA_ARGS__ +// and can be enabled with the DNNL_USE_ACL CMake option +#ifdef DNNL_USE_ACL +#define DNNL_ACL_ONLY(...) __VA_ARGS__ #else -#define DNNL_AARCH64_ACL_ONLY(...) +#define DNNL_ACL_ONLY(...) #endif // Primitive ISA section for configuring knobs. diff --git a/src/cpu/ppc64/ppc64_gemm_s8x8s32.cpp b/src/cpu/ppc64/ppc64_gemm_s8x8s32.cpp index 33f88cb17c4..f7ce2b90ac3 100644 --- a/src/cpu/ppc64/ppc64_gemm_s8x8s32.cpp +++ b/src/cpu/ppc64/ppc64_gemm_s8x8s32.cpp @@ -150,8 +150,9 @@ dnnl_status_t cblas_gemm_s8x8s32_ppc64(int ATflag, int BTflag, } } for (int i = 0; i < m; ++i) { - comparray[i] = out_round(saturate( - ((double)comparray[i]) * alpha * -128.0)); + comparray[i] = cpu::q10n::out_round( + cpu::q10n::saturate( + ((double)comparray[i]) * alpha * -128.0)); } for (int j = 0; j < n; ++j) { int *ca = comparray; diff --git a/src/cpu/primitive_attr_postops.cpp b/src/cpu/primitive_attr_postops.cpp index fa80cb23683..d5ddd73cd77 100644 --- a/src/cpu/primitive_attr_postops.cpp +++ b/src/cpu/primitive_attr_postops.cpp @@ -26,7 +26,7 @@ namespace cpu { using namespace alg_kind; using namespace math; -float compute_binary_scalar(alg_kind_t alg, float x, float y) { +float compute_binary_scalar(alg_kind_t alg, float x, float y, bool c) { switch (alg) { case binary_add: return x + y; case binary_div: return x / y; @@ -40,7 +40,9 @@ float compute_binary_scalar(alg_kind_t alg, float x, float y) { case binary_lt: return x < y; case binary_eq: return x == y; case binary_ne: return x != y; - default: assert(!"not supported operation!"); return NAN; + case binary_select: return c ? x : y; + case binary_prelu: return x >= 0 ? 
x : x * y; + default: assert(!"unsupported operation!"); return NAN; } } @@ -69,6 +71,9 @@ float compute_eltwise_scalar_fwd( case eltwise_mish: d = mish_fwd(s); break; case eltwise_hardsigmoid: d = hardsigmoid_fwd(s, alpha, beta); break; case eltwise_hardswish: d = hardswish_fwd(s, alpha, beta); break; + case eltwise_hsigmoid: d = hsigmoid_fwd(s); break; + case eltwise_round_half_away_from_zero: d = round_half_away_from_zero_fwd(s); break; + case eltwise_round_half_to_even: d = round_half_to_even_fwd(s); break; case eltwise_relu_use_dst_for_bwd: d = relu_fwd(s, alpha); break; case eltwise_tanh_use_dst_for_bwd: d = tanh_fwd(s); break; case eltwise_elu_use_dst_for_bwd: d = elu_fwd(s, alpha); break; @@ -136,15 +141,16 @@ ref_binary_scalar_t::ref_binary_scalar_t(alg_kind_t alg) : alg_(alg) { alg_kind::binary_min, alg_kind::binary_mul, alg_kind::binary_div, alg_kind::binary_sub, alg_kind::binary_ge, alg_kind::binary_gt, alg_kind::binary_le, alg_kind::binary_lt, alg_kind::binary_eq, - alg_kind::binary_ne)); + alg_kind::binary_ne, alg_kind::binary_select, alg_kind::binary_prelu)); } ref_binary_scalar_t::ref_binary_scalar_t( const post_ops_t::entry_t::binary_t &binary) : ref_binary_scalar_t(binary.alg) {} -float ref_binary_scalar_t::compute_scalar(float src0, float src1) const { - return compute_binary_scalar(alg_, src0, src1); +float ref_binary_scalar_t::compute_scalar( + float src0, float src1, bool src2) const { + return compute_binary_scalar(alg_, src0, src1, src2); } ref_eltwise_scalar_fwd_t::ref_eltwise_scalar_fwd_t( @@ -155,11 +161,12 @@ ref_eltwise_scalar_fwd_t::ref_eltwise_scalar_fwd_t( eltwise_soft_relu, eltwise_mish, eltwise_logistic, eltwise_exp, eltwise_gelu_tanh, eltwise_swish, eltwise_log, eltwise_clip, eltwise_clip_v2, eltwise_pow, eltwise_gelu_erf, eltwise_round, - eltwise_hardsigmoid, eltwise_hardswish, - eltwise_relu_use_dst_for_bwd, eltwise_tanh_use_dst_for_bwd, - eltwise_elu_use_dst_for_bwd, eltwise_sqrt_use_dst_for_bwd, - eltwise_logistic_use_dst_for_bwd, eltwise_exp_use_dst_for_bwd, - eltwise_clip_v2_use_dst_for_bwd)); + eltwise_hardswish, eltwise_hardsigmoid, + eltwise_hsigmoid, eltwise_round_half_away_from_zero, eltwise_round_half_to_even, + eltwise_relu_use_dst_for_bwd, + eltwise_tanh_use_dst_for_bwd, eltwise_elu_use_dst_for_bwd, + eltwise_sqrt_use_dst_for_bwd, eltwise_logistic_use_dst_for_bwd, + eltwise_exp_use_dst_for_bwd, eltwise_clip_v2_use_dst_for_bwd)); } ref_eltwise_scalar_fwd_t::ref_eltwise_scalar_fwd_t( @@ -179,6 +186,8 @@ ref_post_ops_t::ref_post_ops_t(const post_ops_t &po, bool skip_sum) eltwise_po_.emplace_back(e.eltwise); } else if (po_.contain(primitive_kind::binary, idx)) { binary_po_.emplace_back(e.binary); + } else if (po_.contain(primitive_kind::depthwise, idx)) { + depthwise_po_.emplace_back(e.depthwise.alg); } } } @@ -273,12 +282,13 @@ float ref_dropout( return (m) ? 
src * inv_q : 0; } -void ref_post_ops_t::execute(float &res, const args_t &args) const { +void ref_post_ops_t::execute(float &res, const args_t &args, const size_t oc) const { if (po_.len() == 0) return; auto it_eltwise_po = eltwise_po_.begin(); auto it_binary_po = binary_po_.begin(); auto it_prelu_md = prelu_md_.begin(); + auto it_depthwise_po = depthwise_po_.begin(); for (auto idx = 0; idx < po_.len(); ++idx) { const auto &e = po_.entry_[idx]; switch (e.kind) { @@ -308,7 +318,7 @@ void ref_post_ops_t::execute(float &res, const args_t &args) const { (DNNL_ARG_ATTR_MULTIPLE_POST_OP(idx) | DNNL_ARG_SRC_1)); const float val_po = io::load_float_value( src1_desc.data_type, src1_binary_po, off); - res = it_binary_po->compute_scalar(res, val_po); + res = it_binary_po->compute_scalar(res, val_po, false); ++it_binary_po; } break; case primitive_kind::prelu: { @@ -339,6 +349,46 @@ void ref_post_ops_t::execute(float &res, const args_t &args) const { res = weights_value * res; ++it_prelu_md; } break; + case primitive_kind::depthwise: { + const exec_ctx_t &ctx = *args.ctx; + auto depthwise_base = CTX_IN_MEM(const float *, (DNNL_ARG_ATTR_MULTIPLE_POST_OP(idx) | DNNL_ARG_SRC_1)); + auto depthwise_weights = depthwise_base + e.depthwise.offset[e.depthwise.scales]; + auto depthwise_bias = depthwise_base + e.depthwise.offset[e.depthwise.shifts]; + + res = it_depthwise_po->compute_scalar(res, depthwise_weights + oc, depthwise_bias + oc); + + ++it_depthwise_po; + } break; + case primitive_kind::quantization: { + bool do_dequantization = e.quantization.alg == alg_kind::quantization_quantize_dequantize; + bool do_rounding = do_dequantization || args.dst_md->data_type == dnnl_f32 || idx != po_.len() - 1; + + auto quant = e.quantization; + const exec_ctx_t &ctx = *args.ctx; + auto quantization_base = CTX_IN_MEM(const float *, (DNNL_ARG_ATTR_MULTIPLE_POST_OP(idx) | DNNL_ARG_SRC_1)); + const auto pcl = quantization_base + quant.offset[quant.crop_low]; + const auto pch = quantization_base + quant.offset[quant.crop_high]; + const auto pisc = quantization_base + quant.offset[quant.inp_scale]; + const auto pish = quantization_base + quant.offset[quant.inp_shift]; + const auto posc = quantization_base + quant.offset[quant.output_scale]; + const auto posh = quantization_base + quant.offset[quant.output_shift]; + + int cl_idx = !quant.per_channel[quant.crop_low] ? 0 : oc; + int ch_idx = !quant.per_channel[quant.crop_high] ? 0 : oc; + int isc_idx = !quant.per_channel[quant.inp_scale] ? 0 : oc; + int ish_idx = !quant.per_channel[quant.inp_shift] ? 0 : oc; + int osc_idx = !quant.per_channel[quant.output_scale] ? 0 : oc; + int osh_idx = !quant.per_channel[quant.output_shift] ? 
0 : oc;
+
+                res = nstl::min(pch[ch_idx], nstl::max(pcl[cl_idx], res));
+                res = res * pisc[isc_idx] + pish[ish_idx];
+
+                if (do_rounding)
+                    res = roundf(res);
+
+                if (do_dequantization)
+                    res = res * posc[osc_idx] + posh[osh_idx];
+            } break;
            default: assert(!"unsupported post op primitive kind!");
        }
    }
diff --git a/src/cpu/primitive_attr_postops.hpp b/src/cpu/primitive_attr_postops.hpp
index bcb09b2e004..3759b89a8d4 100644
--- a/src/cpu/primitive_attr_postops.hpp
+++ b/src/cpu/primitive_attr_postops.hpp
@@ -22,11 +22,13 @@
 #include "common/primitive.hpp"
 #include "common/primitive_attr.hpp"
 
+#include "ref_depthwise_injector.hpp"
+
 namespace dnnl {
 namespace impl {
 namespace cpu {
 
-float compute_binary_scalar(alg_kind_t alg, float x, float y);
+float compute_binary_scalar(alg_kind_t alg, float x, float y, bool c);
 float compute_eltwise_scalar_fwd(
         const alg_kind_t alg, float s, float alpha, float beta);
 float compute_eltwise_scalar_bwd(
@@ -36,7 +38,7 @@ struct ref_binary_scalar_t {
     ref_binary_scalar_t(alg_kind_t alg);
     ref_binary_scalar_t(const post_ops_t::entry_t::binary_t &binary);
 
-    float compute_scalar(float src0, float src1) const;
+    float compute_scalar(float src0, float src1, bool src2) const;
 
 private:
     const alg_kind_t alg_;
@@ -71,7 +73,7 @@ struct ref_post_ops_t {
 
     status_t init(const memory_desc_t *dst_md);
 
-    void execute(float &res, const args_t &args = args_t()) const;
+    void execute(float &res, const args_t &args = args_t(), const size_t oc = 0) const;
 
     static bool primitive_kind_ok(const post_ops_t &po) {
         using namespace primitive_kind;
@@ -86,6 +88,7 @@ struct ref_post_ops_t {
 
     std::vector<ref_eltwise_scalar_fwd_t> eltwise_po_;
     std::vector<ref_binary_scalar_t> binary_po_;
+    std::vector<ref_depthwise_scalar_fwd_t> depthwise_po_;
     std::vector<memory_desc_t> prelu_md_;
 };
 
diff --git a/src/cpu/ref_batch_normalization.cpp b/src/cpu/ref_batch_normalization.cpp
index 0e4a23e8d7e..6ab3c20742f 100644
--- a/src/cpu/ref_batch_normalization.cpp
+++ b/src/cpu/ref_batch_normalization.cpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2016-2024 Intel Corporation
+* Copyright 2016-2025 Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -158,8 +158,8 @@ status_t ref_batch_normalization_fwd_t<d_type>::execute_forward(
         }
     }
     if (d_type == s8)
-        dst[d_off]
-                = q10n::qz_a1b0<float, data_t>()(maybe_post_op(bn_res));
+        dst[d_off] = q10n::qz_a1b0_t<float, data_t>()(
+                maybe_post_op(bn_res));
     else
         dst[d_off] = maybe_post_op(bn_res);
 }
diff --git a/src/cpu/ref_batch_normalization.hpp b/src/cpu/ref_batch_normalization.hpp
index 2e712945533..2932b2a9c7d 100644
--- a/src/cpu/ref_batch_normalization.hpp
+++ b/src/cpu/ref_batch_normalization.hpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2016-2024 Intel Corporation
+* Copyright 2016-2025 Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
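(Editorial aside: the quantization post-op added above is a clamp, an input scale/shift onto the quantized grid, an optional round, and an output scale/shift back. A scalar sketch with made-up per-tensor parameters — the real code reads them from per-channel arrays via the offsets shown above:)

    #include <algorithm>
    #include <cmath>

    float quant_dequant_sketch(float res) {
        const float crop_lo = 0.f, crop_hi = 6.f; // hypothetical clamp bounds
        const float isc = 25.5f, ish = 0.f; // input scale / shift
        const float osc = 1.f / 25.5f, osh = 0.f; // output scale / shift
        res = std::min(crop_hi, std::max(crop_lo, res)); // crop
        res = res * isc + ish; // map onto the quantized grid
        res = std::round(res); // the do_rounding branch
        return res * osc + osh; // the do_dequantization branch
    }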
@@ -35,10 +35,8 @@ namespace cpu { template struct ref_batch_normalization_fwd_t : public primitive_t { struct pd_t : public cpu_batch_normalization_fwd_pd_t { - pd_t(const batch_normalization_desc_t *adesc, - const primitive_attr_t *attr, - const batch_normalization_fwd_pd_t *hint_fwd_pd) - : cpu_batch_normalization_fwd_pd_t(adesc, attr, hint_fwd_pd) {} + using cpu_batch_normalization_fwd_pd_t:: + cpu_batch_normalization_fwd_pd_t; DECLARE_COMMON_PD_T("ref:any", ref_batch_normalization_fwd_t); @@ -80,7 +78,7 @@ struct ref_batch_normalization_fwd_t : public primitive_t { ref_batch_normalization_fwd_t(const pd_t *apd) : primitive_t(apd) {} - typedef typename prec_traits::type data_t; + using data_t = typename prec_traits_t::type; status_t execute(const exec_ctx_t &ctx) const override { return execute_forward(ctx); @@ -94,10 +92,8 @@ struct ref_batch_normalization_fwd_t : public primitive_t { template struct ref_batch_normalization_bwd_t : public primitive_t { struct pd_t : public cpu_batch_normalization_bwd_pd_t { - pd_t(const batch_normalization_desc_t *adesc, - const primitive_attr_t *attr, - const batch_normalization_fwd_pd_t *hint_fwd_pd) - : cpu_batch_normalization_bwd_pd_t(adesc, attr, hint_fwd_pd) {} + using cpu_batch_normalization_bwd_pd_t:: + cpu_batch_normalization_bwd_pd_t; DECLARE_COMMON_PD_T("ref:any", ref_batch_normalization_bwd_t); @@ -138,7 +134,7 @@ struct ref_batch_normalization_bwd_t : public primitive_t { }; ref_batch_normalization_bwd_t(const pd_t *apd) : primitive_t(apd) {} - typedef typename prec_traits::type data_t; + using data_t = typename prec_traits_t::type; status_t execute(const exec_ctx_t &ctx) const override { return execute_backward(ctx); diff --git a/src/cpu/ref_binary.cpp b/src/cpu/ref_binary.cpp index 8d6788b77df..ad7f28ef592 100644 --- a/src/cpu/ref_binary.cpp +++ b/src/cpu/ref_binary.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2022 Intel Corporation +* Copyright 2019-2024 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -37,6 +37,8 @@ namespace cpu {
 status_t ref_binary_t::execute_ref(const exec_ctx_t &ctx) const {
     const auto src0 = CTX_IN_MEM(const void *, DNNL_ARG_SRC_0);
     const auto src1 = CTX_IN_MEM(const void *, DNNL_ARG_SRC_1);
+    const auto src2 = CTX_IN_MEM(const void *, DNNL_ARG_SRC_2);
+
     auto dst = CTX_OUT_MEM(void *, DNNL_ARG_DST);
 
     const float *scales[2];
@@ -45,10 +47,12 @@ status_t ref_binary_t::execute_ref(const exec_ctx_t &ctx) const {
 
     const memory_desc_wrapper src0_d(pd()->src_md(0));
     const memory_desc_wrapper src1_d(pd()->src_md(1));
+    const memory_desc_wrapper src2_d(pd()->src_md(2));
     const memory_desc_wrapper dst_d(pd()->dst_md());
 
     const auto src0_dt = src0_d.data_type();
     const auto src1_dt = src1_d.data_type();
+    const auto src2_dt = src2_d.data_type();
     const auto dst_dt = dst_d.data_type();
 
     const auto alg = pd()->desc()->alg_kind;
@@ -85,10 +89,11 @@ status_t ref_binary_t::execute_ref(const exec_ctx_t &ctx) const {
     }
 
     parallel_nd(nelems, [&](dim_t i) {
-        dims_t dims_src0, dims_src1; // decomposition for physical offsets
+        // decomposition for physical offsets
+        dims_t dims_src0, dims_src1, dims_src2;
         utils::l_dims_by_l_offset(dims_src0, i, dst_d.dims(), ndims);
         utils::l_dims_by_l_offset(dims_src1, i, dst_d.dims(), ndims);
-        auto off_C = dst_d.off_v(dims_src0);
+        auto off_D = dst_d.off_v(dims_src0);
 
         int mask_src0
                 = utils::get_dims_mask(dst_d.dims(), src0_d.dims(), ndims);
@@ -101,12 +106,22 @@ status_t ref_binary_t::execute_ref(const exec_ctx_t &ctx) const {
 
         float x_f = io::load_float_value(src0_dt, src0, off_A);
         float y_f = io::load_float_value(src1_dt, src1, off_B);
-        float dst_f = io::load_float_value(dst_dt, dst, off_C);
+        float dst_f = io::load_float_value(dst_dt, dst, off_D);
 
         x_f *= scales[0][0];
         y_f *= scales[1][0];
 
-        float acc = compute_binary_scalar(alg, x_f, y_f);
+        bool c_f = false;
+        if (pd()->is_ternary_op()) {
+            utils::l_dims_by_l_offset(dims_src2, i, dst_d.dims(), ndims);
+            int mask_src2
+                    = utils::get_dims_mask(dst_d.dims(), src2_d.dims(), ndims);
+            utils::apply_mask_on_dims(dims_src2, ndims, mask_src2);
+            const auto off_C = src2_d.off_v(dims_src2);
+            c_f = static_cast<bool>(io::load_int_value(src2_dt, src2, off_C));
+        }
+
+        float acc = compute_binary_scalar(alg, x_f, y_f, c_f);
 
         if (has_postops) {
             ref_post_ops_t::args_t args;
@@ -117,7 +132,7 @@ status_t ref_binary_t::execute_ref(const exec_ctx_t &ctx) const {
             ref_post_ops->execute(acc, args);
         }
 
-        io::store_float_value(dst_dt, acc, dst, off_C);
+        io::store_float_value(dst_dt, acc, dst, off_D);
     });
 
     return status::success;
diff --git a/src/cpu/ref_binary.hpp b/src/cpu/ref_binary.hpp
index 459475a580d..b59c0688fbc 100644
--- a/src/cpu/ref_binary.hpp
+++ b/src/cpu/ref_binary.hpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2019-2024 Intel Corporation
+* Copyright 2019-2025 Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
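(Editorial aside: per element, the ternary binary_select wired up above reduces to a conditional pick driven by the integer src2 tensor. A simplified sketch that ignores broadcasting, scales, and post-ops:)

    #include <cstddef>
    #include <cstdint>

    void select_sketch(const float *src0, const float *src1,
            const uint8_t *src2, float *dst, size_t nelems) {
        for (size_t i = 0; i < nelems; ++i)
            dst[i] = src2[i] ? src0[i] : src1[i]; // dst = cond ? A : B
    }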
@@ -49,13 +49,17 @@ struct ref_binary_t : public primitive_t { VDISPATCH_BINARY( platform::has_data_type_support(src_md(1)->data_type), VERBOSE_UNSUPPORTED_DT); + VDISPATCH_BINARY(IMPLICATION(is_ternary_op(), + platform::has_data_type_support( + src_md(2)->data_type)), + VERBOSE_UNSUPPORTED_DT); VDISPATCH_BINARY( platform::has_data_type_support(dst_md()->data_type), VERBOSE_UNSUPPORTED_DT); VDISPATCH_BINARY(set_default_params() == status::success, VERBOSE_UNSUPPORTED_TAG); - VDISPATCH_BINARY(attr()->has_default_values( - sm::post_ops | sm::scales_runtime), + VDISPATCH_BINARY( + attr()->has_default_values(sm::post_ops | sm::scales), VERBOSE_UNSUPPORTED_ATTR); VDISPATCH_BINARY(IMPLICATION(!attr()->scales_.has_default_values(), check_scales_mask()), diff --git a/src/cpu/ref_concat.hpp b/src/cpu/ref_concat.hpp index 81a86883f17..6b87295dfcf 100644 --- a/src/cpu/ref_concat.hpp +++ b/src/cpu/ref_concat.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2017-2024 Intel Corporation +* Copyright 2017-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -31,19 +31,15 @@ namespace cpu { struct ref_concat_t : public primitive_t { struct pd_t : public cpu_concat_pd_t { - pd_t(const primitive_attr_t *attr, const memory_desc_t *dst_md, int n, - int concat_dim, const memory_desc_t *const *src_mds) - : cpu_concat_pd_t(attr, dst_md, n, concat_dim, src_mds) - , tent_dst_md_(types::zero_md()) {} - pd_t(const pd_t &rhs) = default; - ~pd_t() = default; + using cpu_concat_pd_t::cpu_concat_pd_t; DECLARE_CONCAT_PD_T("ref:any", ref_concat_t); status_t init(engine_t *engine) { using sm = primitive_attr_t::skip_mask_t; - VDISPATCH_CONCAT(attr()->has_default_values(sm::scales_runtime), + VDISPATCH_CONCAT(attr()->has_default_values(sm::scales), VERBOSE_UNSUPPORTED_ATTR); + tent_dst_md_ = types::zero_md(); status_t status = cpu_concat_pd_t::init(); if (status != status::success) { assert(dst_md_.format_kind != format_kind::undef); @@ -62,11 +58,10 @@ struct ref_concat_t : public primitive_t { reorder_pds_.resize(n_ + use_tent_dst()); for (int i = 0; i < n_; ++i) { primitive_attr_t r_attr; - if (!sc.get(DNNL_ARG_MULTIPLE_SRC + i).has_default_values()) { - int mask = 0; - CHECK(sc.get(DNNL_ARG_MULTIPLE_SRC + i, &mask, nullptr)); - if (mask != 0) return status::unimplemented; - r_attr.scales_.set(DNNL_ARG_SRC, mask); + if (!sc.has_default_values(DNNL_ARG_MULTIPLE_SRC + i)) { + int mask = sc.get_mask(DNNL_ARG_MULTIPLE_SRC + i); + VDISPATCH_CONCAT(mask == 0, VERBOSE_UNSUPPORTED_SCALES_CFG); + CHECK(r_attr.scales_.set(DNNL_ARG_SRC, mask)); } CHECK(reorder_primitive_desc_create(reorder_pds_[i], engine, src_md(i), src_image_md(i), &r_attr)); @@ -114,7 +109,7 @@ struct ref_concat_t : public primitive_t { return status::success; } - ~ref_concat_t() = default; + ~ref_concat_t() override = default; status_t execute(const exec_ctx_t &ctx) const override { using namespace memory_tracking::names; @@ -145,8 +140,10 @@ struct ref_concat_t : public primitive_t { = scratchpad.get_memory_storage(key_concat_tent_dst); for (int i = 0; i < n; ++i) { - memory_t tent_dst_i(engine, pd()->src_image_md(i), - tent_dst_storage->clone()); + std::unique_ptr tent_dst_i; + CHECK(safe_ptr_assign(tent_dst_i, + new memory_t(engine, pd()->src_image_md(i), + tent_dst_storage->clone()))); const auto &src_scales_arg = ctx.args().find( DNNL_ARG_ATTR_SCALES | (DNNL_ARG_MULTIPLE_SRC + i)); const 
memory_arg_t *src_scales = nullptr; @@ -154,18 +151,22 @@ struct ref_concat_t : public primitive_t { src_scales = &src_scales_arg->second; execute_reorder(reorders_[i], ctx.args().at(DNNL_ARG_MULTIPLE_SRC + i), - {&tent_dst_i, false}, src_scales, i); + {tent_dst_i.get(), false}, src_scales, i); } - memory_t tent_dst( - engine, &pd()->tent_dst_md_, tent_dst_storage->clone()); - execute_reorder(reorders_[n], {&tent_dst, true}, + std::unique_ptr tent_dst; + CHECK(safe_ptr_assign(tent_dst, + new memory_t(engine, &pd()->tent_dst_md_, + tent_dst_storage->clone()))); + execute_reorder(reorders_[n], {tent_dst.get(), true}, ctx.args().at(DNNL_ARG_DST), nullptr, n); } else { auto &dst_mem_storage = CTX_OUT_STORAGE(DNNL_ARG_DST); for (int i = 0; i < n; ++i) { - memory_t tent_dst_i( - engine, pd()->src_image_md(i), dst_mem_storage.clone()); + std::unique_ptr tent_dst_i; + CHECK(safe_ptr_assign(tent_dst_i, + new memory_t(engine, pd()->src_image_md(i), + dst_mem_storage.clone()))); const auto &src_scales_arg = ctx.args().find( DNNL_ARG_ATTR_SCALES | (DNNL_ARG_MULTIPLE_SRC + i)); const memory_arg_t *src_scales = nullptr; @@ -173,7 +174,7 @@ struct ref_concat_t : public primitive_t { src_scales = &src_scales_arg->second; execute_reorder(reorders_[i], ctx.args().at(DNNL_ARG_MULTIPLE_SRC + i), - {&tent_dst_i, false}, src_scales, i); + {tent_dst_i.get(), false}, src_scales, i); } } return status::success; diff --git a/src/cpu/ref_convolution.cpp b/src/cpu/ref_convolution.cpp index b97c7b76942..22c48174819 100644 --- a/src/cpu/ref_convolution.cpp +++ b/src/cpu/ref_convolution.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2016-2021 Intel Corporation +* Copyright 2016-2024 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -35,6 +35,8 @@ status_t ref_convolution_fwd_t::execute_forward(const exec_ctx_t &ctx) const { auto src = CTX_IN_MEM(const void *, DNNL_ARG_SRC); auto weights = CTX_IN_MEM(const void *, DNNL_ARG_WEIGHTS); auto bias = CTX_IN_MEM(const void *, DNNL_ARG_BIAS); + const auto rnd_seed + = CTX_IN_MEM(const uint32_t *, DNNL_ARG_ATTR_ROUNDING_SEED); auto dst = CTX_OUT_CLEAN_MEM(void *, DNNL_ARG_DST, status); CHECK(status); @@ -73,6 +75,7 @@ status_t ref_convolution_fwd_t::execute_forward(const exec_ctx_t &ctx) const { const auto padL = pd()->padL(); const auto ndims = pd()->desc()->src_desc.ndims; + const auto dst_rnd_mode = pd()->attr()->rounding_mode_.get(DNNL_ARG_DST); auto ker = [=](dim_t g, dim_t mb, dim_t oc, dim_t od, dim_t oh, dim_t ow) { float d = 0; @@ -211,7 +214,11 @@ status_t ref_convolution_fwd_t::execute_forward(const exec_ctx_t &ctx) const { args.ctx = &ctx; args.l_offset = dst_l_off; args.dst_md = pd()->dst_md(); - ref_post_ops->execute(d, args); + ref_post_ops->execute(d, args, g*OC + oc); + if (dst_rnd_mode == rounding_mode::stochastic) + d = math::stochastic_round_fwd( + d, dst_off, rnd_seed[0], dst_d.data_type()); + io::store_float_value(dst_d.data_type(), d, dst, dst_off); }); @@ -387,6 +394,8 @@ status_t ref_convolution_bwd_data_t::execute_backward_data( return ds; }; + const auto &p = pd()->attr()->post_ops_; + parallel_nd(G, MB, IC, ID, IH, IW, [&](dim_t g, dim_t mb, dim_t ic, dim_t id, dim_t ih, dim_t iw) { float ds = 0; @@ -396,6 +405,19 @@ status_t ref_convolution_bwd_data_t::execute_backward_data( else ds += ker(g, mb, ic, id, ih, iw); + int depthwise_inj_idx = 0; + for (int i = 0; i < p.len(); i++) { + auto &post_op = p.entry_[i]; + if (post_op.is_depthwise()) { + auto depthwise_base = CTX_IN_MEM(const float *, (DNNL_ARG_ATTR_MULTIPLE_POST_OP(i) | DNNL_ARG_SRC_1)); + auto depthwise_weights = depthwise_base + post_op.depthwise.offset[post_op.depthwise.scales]; + auto depthwise_bias = depthwise_base + post_op.depthwise.offset[post_op.depthwise.shifts]; + + ds = depthwise_injectors[depthwise_inj_idx]->compute_scalar(ds, depthwise_weights + g * IC + ic, depthwise_bias + g * IC + ic); + depthwise_inj_idx++; + } + } + const auto diff_src_off = ref_conv_utils::get_data_off( diff_src_d, ndims, mb, g * IC + ic, id, ih, iw); io::store_float_value( diff --git a/src/cpu/ref_convolution.hpp b/src/cpu/ref_convolution.hpp index dcbb7c909c0..d5ef320b6ea 100644 --- a/src/cpu/ref_convolution.hpp +++ b/src/cpu/ref_convolution.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2016-2023 Intel Corporation +* Copyright 2016-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -27,6 +27,8 @@ #include "cpu/cpu_convolution_pd.hpp" #include "cpu/primitive_attr_postops.hpp" +#include "ref_depthwise_injector.hpp" + namespace dnnl { namespace impl { namespace cpu { @@ -45,23 +47,42 @@ struct ref_convolution_fwd_t : public primitive_t { const auto bia_type = weights_md(1)->data_type; const auto dst_type = dst_md(0)->data_type; - bool ok = is_fwd() - && set_default_alg_kind(alg_kind::convolution_direct) - && platform::has_data_type_support(src_type) - && platform::has_data_type_support(bia_type) - && platform::has_data_type_support(dst_type) - && utils::one_of(src_type, f32, bf16, f16, f8_e5m2, f8_e4m3) - && src_type == wei_type - && utils::one_of(dst_type, src_type, f32) - && utils::one_of(bia_type, data_type::undef, src_type, f32) - && set_default_formats() - && attr()->has_default_values( - smask_t::post_ops | smask_t::sum_dt, dst_type) - && attr()->post_ops_.check_sum_consistency( - dst_type, /* is_int8 */ false) - && post_ops_ok() - && attr_.set_default_formats(dst_md(0)) == status::success; - return ok ? status::success : status::unimplemented; + VDISPATCH_CONV(is_fwd(), VERBOSE_BAD_PROPKIND); + VDISPATCH_CONV(set_default_alg_kind(alg_kind::convolution_direct), + VERBOSE_BAD_ALGORITHM); + VDISPATCH_CONV(platform::has_data_type_support(src_type), + VERBOSE_UNSUPPORTED_DT); + VDISPATCH_CONV(platform::has_data_type_support(bia_type), + VERBOSE_UNSUPPORTED_DT); + VDISPATCH_CONV(platform::has_data_type_support(dst_type), + VERBOSE_UNSUPPORTED_DT); + VDISPATCH_CONV( + utils::one_of(src_type, f32, bf16, f16, f8_e5m2, f8_e4m3), + VERBOSE_UNSUPPORTED_DT); + VDISPATCH_CONV(IMPLICATION(src_type != wei_type, + utils::one_of(wei_type, f16, bf16) + && src_type == f32), + VERBOSE_UNSUPPORTED_DT); + VDISPATCH_CONV(utils::one_of(dst_type, src_type, f32), + VERBOSE_UNSUPPORTED_DT); + VDISPATCH_CONV( + utils::one_of(bia_type, data_type::undef, src_type, f32), + VERBOSE_UNSUPPORTED_BIAS_CFG); + VDISPATCH_CONV(set_default_formats(), VERBOSE_UNSUPPORTED_TAG); + VDISPATCH_CONV( + attr()->has_default_values(smask_t::post_ops + | smask_t::sum_dt | smask_t::rounding_mode, + dst_type), + VERBOSE_UNSUPPORTED_POSTOP); + VDISPATCH_CONV(attr()->post_ops_.check_sum_consistency( + dst_type, /* is_int8 */ false), + VERBOSE_UNSUPPORTED_POSTOP); + VDISPATCH_CONV(post_ops_ok(), VERBOSE_UNSUPPORTED_POSTOP); + VDISPATCH_CONV( + attr_.set_default_formats(dst_md(0)) == status::success, + VERBOSE_UNSUPPORTED_POSTOP); + + return status::success; } protected: @@ -111,16 +132,31 @@ struct ref_convolution_bwd_data_t : public primitive_t { const auto wei_type = weights_md(0)->data_type; const auto diff_dst_type = diff_dst_md(0)->data_type; - bool ok = desc()->prop_kind == prop_kind::backward_data - && set_default_alg_kind(alg_kind::convolution_direct) - && platform::has_data_type_support(diff_src_type) - && platform::has_data_type_support(diff_dst_type) - && utils::one_of(diff_dst_type, f32, bf16, f16) - && wei_type == diff_dst_type - && utils::one_of(diff_src_type, f32, diff_dst_type) - && set_default_formats() && attr()->has_default_values(); - - return ok ? 
status::success : status::unimplemented; + VDISPATCH_CONV(desc()->prop_kind == prop_kind::backward_data, + VERBOSE_BAD_PROPKIND); + VDISPATCH_CONV(set_default_alg_kind(alg_kind::convolution_direct), + VERBOSE_BAD_ALGORITHM); + VDISPATCH_CONV(platform::has_data_type_support(diff_src_type), + VERBOSE_UNSUPPORTED_DT); + VDISPATCH_CONV(platform::has_data_type_support(diff_dst_type), + VERBOSE_UNSUPPORTED_DT); + VDISPATCH_CONV(utils::one_of(diff_dst_type, f32, bf16, f16), + VERBOSE_UNSUPPORTED_DT); + VDISPATCH_CONV(IMPLICATION(wei_type != diff_dst_type, + utils::one_of(wei_type, f16, bf16) + && diff_dst_type == f32), + VERBOSE_UNSUPPORTED_DT); + VDISPATCH_CONV(utils::one_of(diff_src_type, f32, diff_dst_type), + VERBOSE_UNSUPPORTED_DT); + VDISPATCH_CONV(set_default_formats(), VERBOSE_UNSUPPORTED_TAG); + VDISPATCH_CONV( + attr()->has_default_values(primitive_attr_t::skip_mask_t::post_ops), VERBOSE_UNSUPPORTED_ATTR); + VDISPATCH_CONV( + is_supported_post_ops(), VERBOSE_UNSUPPORTED_POSTOP); + + return status::success; } protected: @@ -132,9 +168,41 @@ struct ref_convolution_bwd_data_t : public primitive_t { : utils::pick(ndims() - 3, oiw, oihw, oidhw); return set_default_formats_common(dat_tag, wei_tag, dat_tag); } + + bool is_supported_post_ops() const { + const auto &p = this->attr()->post_ops_; + if (p.len() > 1) + return false; + + auto all_post_ops_supported = [&]() { + bool ok = true; + + for (int i = 0; i < p.len(); i++) { + ok = ok && utils::one_of(p.entry_[i].kind, primitive_kind::depthwise); + } + return ok; + }; + + return all_post_ops_supported(); + } }; - ref_convolution_bwd_data_t(const pd_t *apd) : primitive_t(apd) {} + ref_convolution_bwd_data_t(const pd_t *apd) : primitive_t(apd) { + const auto &post_ops = pd()->attr()->post_ops_; + + for (int i = 0; i < post_ops.len(); i++) { + auto &post_op = post_ops.entry_[i]; + if (post_op.is_depthwise()) { + depthwise_injectors.push_back(new ref_depthwise_scalar_fwd_t(post_op.depthwise.alg)); + } + } + } + + ~ref_convolution_bwd_data_t() override { + for (auto inj : depthwise_injectors) + delete inj; + depthwise_injectors.clear(); + } status_t execute(const exec_ctx_t &ctx) const override { return execute_backward_data(ctx); @@ -143,6 +211,8 @@ struct ref_convolution_bwd_data_t : public primitive_t { private: status_t execute_backward_data(const exec_ctx_t &ctx) const; const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); } + + nstl::vector<ref_depthwise_scalar_fwd_t *> depthwise_injectors; }; struct ref_convolution_bwd_weights_t : public primitive_t { @@ -159,17 +229,27 @@ struct ref_convolution_bwd_weights_t : public primitive_t { const auto diff_bia_type = diff_weights_md(1)->data_type; const auto diff_dst_type = diff_dst_md(0)->data_type; - bool ok = desc()->prop_kind == prop_kind::backward_weights - && set_default_alg_kind(alg_kind::convolution_direct) - && platform::has_data_type_support(src_type) - && platform::has_data_type_support(diff_wei_type) - && utils::one_of(src_type, f32, bf16, f16) - && diff_dst_type == src_type - && utils::one_of(diff_wei_type, f32, src_type) - && utils::one_of( - diff_bia_type, data_type::undef, f32, src_type) - && set_default_formats() && attr()->has_default_values(); - return ok ?
status::success : status::unimplemented; + VDISPATCH_CONV(desc()->prop_kind == prop_kind::backward_weights, + VERBOSE_BAD_PROPKIND); + VDISPATCH_CONV(set_default_alg_kind(alg_kind::convolution_direct), + VERBOSE_BAD_ALGORITHM); + VDISPATCH_CONV(platform::has_data_type_support(src_type), + VERBOSE_UNSUPPORTED_DT); + VDISPATCH_CONV(platform::has_data_type_support(diff_wei_type), + VERBOSE_UNSUPPORTED_DT); + VDISPATCH_CONV(utils::one_of(src_type, f32, bf16, f16), + VERBOSE_UNSUPPORTED_DT); + VDISPATCH_CONV(diff_dst_type == src_type, VERBOSE_UNSUPPORTED_DT); + VDISPATCH_CONV(utils::one_of(diff_wei_type, f32, src_type), + VERBOSE_UNSUPPORTED_DT); + VDISPATCH_CONV(utils::one_of(diff_bia_type, data_type::undef, f32, + src_type), + VERBOSE_UNSUPPORTED_DT); + VDISPATCH_CONV(set_default_formats(), VERBOSE_UNSUPPORTED_TAG); + VDISPATCH_CONV( + attr()->has_default_values(), VERBOSE_UNSUPPORTED_ATTR); + + return status::success; } protected: diff --git a/src/cpu/ref_convolution_int8.cpp b/src/cpu/ref_convolution_int8.cpp index b1c99eb8cda..f2c36332888 100644 --- a/src/cpu/ref_convolution_int8.cpp +++ b/src/cpu/ref_convolution_int8.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2021-2023 Intel Corporation +* Copyright 2021-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -35,7 +35,7 @@ namespace { void dequantize(float &d, dim_t g, dim_t C, dim_t c, const float *wei_scales, bool with_groups, int wei_mask, const float *src_scales) { // scale_idx_mult = 1 for per_channel scales and 0, otherwise - const int wei_scale_idx_mult = wei_mask != 0; + const int wei_scale_idx_mult = wei_mask > 0; float scale = 1.0f; if (src_scales) scale *= src_scales[0]; if (wei_scales) scale *= wei_scales[(g * C + c) * wei_scale_idx_mult]; @@ -63,8 +63,7 @@ status_t ref_convolution_int8_fwd_t::execute_forward( DEFINE_ARG_SCALES_BUFFER(wei_scales, DNNL_ARG_WEIGHTS); DEFINE_ARG_SCALES_BUFFER(dst_scales, DNNL_ARG_DST); - const int wei_scale_mask - = pd()->attr()->scales_.get(DNNL_ARG_WEIGHTS).mask_; + const int wei_scale_mask = pd()->attr()->scales_.get_mask(DNNL_ARG_WEIGHTS); DEFINE_ZERO_POINTS_BUFFER(src_zero_point, DNNL_ARG_SRC); DEFINE_ZERO_POINTS_BUFFER(dst_zero_point, DNNL_ARG_DST); @@ -107,9 +106,9 @@ status_t ref_convolution_int8_fwd_t::execute_forward( // zp_idx_mult = 1 for per_dim1 zero points and 0, otherwise const int src_zp_idx_mult - = !pd()->attr()->zero_points_.common(DNNL_ARG_SRC); + = pd()->attr()->zero_points_.get_mask(DNNL_ARG_SRC) > 0; const int dst_zp_idx_mult - = !pd()->attr()->zero_points_.common(DNNL_ARG_DST); + = pd()->attr()->zero_points_.get_mask(DNNL_ARG_DST) > 0; auto ker = [=](dim_t g, dim_t mb, dim_t oc, dim_t od, dim_t oh, dim_t ow) { int d = 0; @@ -290,8 +289,7 @@ status_t ref_convolution_int8_bwd_data_t::execute_backward_data( DEFINE_ARG_SCALES_BUFFER(wei_scales, DNNL_ARG_WEIGHTS); DEFINE_ARG_SCALES_BUFFER(diff_dst_scales, DNNL_ARG_DST); - const int wei_scale_mask - = pd()->attr()->scales_.get(DNNL_ARG_WEIGHTS).mask_; + const int wei_scale_mask = pd()->attr()->scales_.get_mask(DNNL_ARG_WEIGHTS); const memory_desc_wrapper diff_dst_d(pd()->diff_dst_md()); const memory_desc_wrapper diff_src_d(pd()->diff_src_md()); diff --git a/src/cpu/ref_convolution_int8.hpp b/src/cpu/ref_convolution_int8.hpp index 86a2b6a1554..b6b650c7ac5 100644 --- a/src/cpu/ref_convolution_int8.hpp +++ b/src/cpu/ref_convolution_int8.hpp @@ -1,5 +1,5 @@ 
/******************************************************************************* -* Copyright 2021-2023 Intel Corporation +* Copyright 2021-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -45,22 +45,35 @@ struct ref_convolution_int8_fwd_t : public primitive_t { const auto bia_type = weights_md(1)->data_type; const auto dst_type = dst_md(0)->data_type; - bool ok = is_fwd() - && set_default_alg_kind(alg_kind::convolution_direct) - && utils::one_of(src_type, s8, u8) && wei_type == s8 - && IMPLICATION(with_bias(), - utils::one_of(bia_type, f32, bf16, s32, s8, u8)) - && utils::one_of(dst_type, f32, bf16, s32, s8, u8) - && set_default_formats() - && attr()->has_default_values(smask_t::scales_runtime - | smask_t::zero_points_runtime - | smask_t::post_ops | smask_t::sum_dt, - dst_type) - && attr()->post_ops_.check_sum_consistency(dst_type, - /* is_int8 */ true) - && attr_scales_ok() && zero_points_ok() && post_ops_ok() - && attr_.set_default_formats(dst_md(0)) == status::success; - return ok ? status::success : status::unimplemented; + VDISPATCH_CONV(is_fwd(), VERBOSE_BAD_PROPKIND); + VDISPATCH_CONV(set_default_alg_kind(alg_kind::convolution_direct), + VERBOSE_BAD_ALGORITHM); + VDISPATCH_CONV( + utils::one_of(src_type, s8, u8), VERBOSE_UNSUPPORTED_DT); + VDISPATCH_CONV(wei_type == s8, VERBOSE_UNSUPPORTED_DT); + VDISPATCH_CONV( + IMPLICATION(with_bias(), + utils::one_of(bia_type, f32, bf16, s32, s8, u8)), + VERBOSE_UNSUPPORTED_BIAS_CFG); + VDISPATCH_CONV(utils::one_of(dst_type, f32, bf16, s32, s8, u8), + VERBOSE_UNSUPPORTED_DT); + VDISPATCH_CONV(set_default_formats(), VERBOSE_UNSUPPORTED_TAG); + VDISPATCH_CONV( + attr()->has_default_values(smask_t::scales + | smask_t::zero_points | smask_t::post_ops + | smask_t::sum_dt, + dst_type), + VERBOSE_UNSUPPORTED_ATTR); + VDISPATCH_CONV(attr()->post_ops_.check_sum_consistency(dst_type, + /* is_int8 */ true), + VERBOSE_UNSUPPORTED_POSTOP); + VDISPATCH_CONV(attr_scales_ok(), VERBOSE_UNSUPPORTED_SCALES_CFG); + VDISPATCH_CONV(zero_points_ok(), VERBOSE_UNSUPPORTED_ZP_CFG); + VDISPATCH_CONV(post_ops_ok(), VERBOSE_UNSUPPORTED_POSTOP); + VDISPATCH_CONV( + attr_.set_default_formats(dst_md(0)) == status::success, + VERBOSE_UNSUPPORTED_POSTOP); + return status::success; } protected: @@ -74,13 +87,18 @@ struct ref_convolution_int8_fwd_t : public primitive_t { } bool zero_points_ok() const { - int mask_src = 0, mask_dst = 0; - attr()->zero_points_.get(DNNL_ARG_SRC, &mask_src); - attr()->zero_points_.get(DNNL_ARG_DST, &mask_dst); - - return attr()->zero_points_.has_default_values(DNNL_ARG_WEIGHTS) - && (mask_src == 0 || mask_src == 1 << 1) - && (mask_dst == 0 || mask_dst == 1 << 1); + if (!attr()->zero_points_.has_default_values(DNNL_ARG_SRC)) { + int mask_src = attr()->zero_points_.get_mask(DNNL_ARG_SRC); + const bool ok = mask_src == 0 || mask_src == (1 << 1); + if (!ok) return false; + } + if (!attr()->zero_points_.has_default_values(DNNL_ARG_DST)) { + int mask_dst = attr()->zero_points_.get_mask(DNNL_ARG_DST); + const bool ok = mask_dst == 0 || mask_dst == (1 << 1); + if (!ok) return false; + } + + return attr()->zero_points_.has_default_values(DNNL_ARG_WEIGHTS); } bool post_ops_ok() const { @@ -120,16 +138,22 @@ struct ref_convolution_int8_bwd_data_t : public primitive_t { const auto wei_type = weights_md(0)->data_type; const auto diff_dst_type = diff_dst_md(0)->data_type; - bool ok = desc()->prop_kind == prop_kind::backward_data - && 
set_default_alg_kind(alg_kind::convolution_direct) - && utils::one_of(diff_dst_type, s8, u8) && wei_type == s8 - && utils::one_of(diff_src_type, f32, bf16, s32, s8, u8) - && set_default_formats() - && attr()->has_default_values( - primitive_attr_t::skip_mask_t::scales_runtime) - && attr_scales_ok(); - - return ok ? status::success : status::unimplemented; + VDISPATCH_CONV(desc()->prop_kind == prop_kind::backward_data, + VERBOSE_BAD_PROPKIND); + VDISPATCH_CONV(set_default_alg_kind(alg_kind::convolution_direct), + VERBOSE_BAD_ALGORITHM); + VDISPATCH_CONV(utils::one_of(diff_dst_type, s8, u8), + VERBOSE_UNSUPPORTED_DT); + VDISPATCH_CONV(wei_type == s8, VERBOSE_UNSUPPORTED_DT); + VDISPATCH_CONV(utils::one_of(diff_src_type, f32, bf16, s32, s8, u8), + VERBOSE_UNSUPPORTED_DT); + VDISPATCH_CONV(set_default_formats(), VERBOSE_UNSUPPORTED_TAG); + VDISPATCH_CONV(attr()->has_default_values( + primitive_attr_t::skip_mask_t::scales), + VERBOSE_UNSUPPORTED_ATTR); + VDISPATCH_CONV(attr_scales_ok(), VERBOSE_UNSUPPORTED_SCALES_CFG); + + return status::success; } protected: diff --git a/src/cpu/ref_deconvolution.cpp b/src/cpu/ref_deconvolution.cpp index facacbd2ffd..f14126764e0 100644 --- a/src/cpu/ref_deconvolution.cpp +++ b/src/cpu/ref_deconvolution.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2018-2023 Intel Corporation +* Copyright 2018-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -174,8 +174,7 @@ status_t ref_deconvolution_fwd_t::compute_oscale( DEFINE_ARG_SCALES_BUFFER(src_scales, DNNL_ARG_SRC); DEFINE_ARG_SCALES_BUFFER(wei_scales, DNNL_ARG_WEIGHTS); - const int wei_scale_mask - = pd()->attr()->scales_.get(DNNL_ARG_WEIGHTS).mask_; + const int wei_scale_mask = pd()->attr()->scales_.get_mask(DNNL_ARG_WEIGHTS); const memory_desc_wrapper dst_d(pd()->dst_md()); @@ -190,7 +189,7 @@ status_t ref_deconvolution_fwd_t::compute_oscale( const auto maybe_oscale = [](float &d, dim_t oc, const float *src_scales, const float *wei_scales, int wei_mask) { // scale_idx_mult = 1 for per_oc scales and 0, otherwise - const int wei_scale_idx_mult = wei_mask != 0; + const int wei_scale_idx_mult = wei_mask > 0; d *= src_scales[0] * wei_scales[oc * wei_scale_idx_mult]; }; @@ -216,11 +215,14 @@ status_t ref_deconvolution_fwd_t::compute_ref_attrs(const exec_ctx_t &ctx, auto dst = CTX_OUT_MEM(void *, DNNL_ARG_DST); DEFINE_ARG_SCALES_BUFFER(dst_scales, DNNL_ARG_DST); - const int dst_scale_mask = pd()->attr()->scales_.get(DNNL_ARG_DST).mask_; + const bool has_dst_scales + = !pd()->attr()->scales_.has_default_values(DNNL_ARG_DST); + const int dst_scale_mask = pd()->attr()->scales_.get_mask(DNNL_ARG_DST); DEFINE_ZERO_POINTS_BUFFER(dst_zero_point, DNNL_ARG_DST); - const bool is_dst_zp_common - = pd()->attr()->zero_points_.common(DNNL_ARG_DST); + const bool has_dst_zp + = !pd()->attr()->zero_points_.has_default_values(DNNL_ARG_DST); + const int dst_zp_mask = pd()->attr()->zero_points_.get_mask(DNNL_ARG_DST); const memory_desc_wrapper dst_d(pd()->dst_md()); @@ -232,20 +234,6 @@ status_t ref_deconvolution_fwd_t::compute_ref_attrs(const exec_ctx_t &ctx, const auto OCP = dst_d.padded_dims()[1]; const auto ndims = pd()->desc()->src_desc.ndims; - const auto maybe_dst_zero_point = [=](float &result, dim_t oc) { - if (is_dst_zp_common) - result += dst_zero_point[0]; - else - result += dst_zero_point[oc]; - }; - - const auto maybe_scale - = [](float &d, dim_t oc, 
const float *scales, int mask) { - // scale_idx_mult = 1 for per_oc scales and 0, otherwise - const int scale_idx_mult = mask != 0; - d *= scales[oc * scale_idx_mult]; - }; - const auto sum_dt = pd()->attr()->post_ops_.get_sum_dt(dst_d.data_type()); parallel_nd(MB, OCP, OD, OH, OW, @@ -269,8 +257,13 @@ status_t ref_deconvolution_fwd_t::compute_ref_attrs(const exec_ctx_t &ctx, args.l_offset = dst_l_off; args.dst_md = pd()->dst_md(); ref_post_ops->execute(tmp_result, args); - maybe_scale(tmp_result, ocp, dst_scales, dst_scale_mask); - maybe_dst_zero_point(tmp_result, ocp); + if (has_dst_scales) { + // scale_idx_mult = 1 for per_oc scales and 0, otherwise + tmp_result *= dst_scales[ocp * (dst_scale_mask > 0)]; + } + if (has_dst_zp) { + tmp_result += dst_zero_point[ocp * (dst_zp_mask > 0)]; + } } io::store_float_value( dst_d.data_type(), tmp_result, dst, dst_off); @@ -300,7 +293,7 @@ dim_t get_weights_off(const memory_desc_wrapper &wei_d, bool with_groups, template <data_type_t wei_type> static void compute_src_zp_compensation(const exec_ctx_t &ctx, const int32_t *src_zero_point, const bool is_src_zp_common, - typename prec_traits<wei_type>::type *wei, + typename prec_traits_t<wei_type>::type *wei, const cpu_deconvolution_fwd_pd_t *pd) { using namespace memory_tracking::names; @@ -347,7 +340,8 @@ template <data_type_t wei_type> static std::function prepare_zp_pad_comp_ker(const dim_t ndims, const int32_t *src_zero_point, - const bool is_src_zp_common, typename prec_traits<wei_type>::type *wei, + const bool is_src_zp_common, + typename prec_traits_t<wei_type>::type *wei, const cpu_deconvolution_fwd_pd_t *deconv_pd) { const auto KH = deconv_pd->KH(); @@ -423,7 +417,7 @@ prepare_zp_pad_comp_ker(const dim_t ndims, const int32_t *src_zero_point, template <data_type_t wei_type> static status_t apply_src_zero_point(const exec_ctx_t &ctx, const cpu_deconvolution_fwd_pd_t *deconv_pd, float *conv_output) { - using wei_data_t = typename prec_traits<wei_type>::type; + using wei_data_t = typename prec_traits_t<wei_type>::type; using namespace memory_tracking::names; using namespace data_type; @@ -432,7 +426,7 @@ static status_t apply_src_zero_point(const exec_ctx_t &ctx, const auto wei = CTX_OUT_MEM(wei_data_t *, DNNL_ARG_WEIGHTS); DEFINE_ZERO_POINTS_BUFFER(src_zero_point, DNNL_ARG_SRC); const bool is_src_zp_common - = deconv_pd->attr()->zero_points_.common(DNNL_ARG_SRC); + = deconv_pd->attr()->zero_points_.get_mask(DNNL_ARG_SRC) == 0; const auto scratchpad = ctx.get_scratchpad_grantor(); const int32_t *const zp_src_compensation @@ -487,9 +481,11 @@ status_t ref_deconvolution_fwd_t::execute(const exec_ctx_t &ctx) const { // Create intermediate memory for f32 output if needed. auto dst = args.at(DNNL_ARG_DST); - memory_t tmp_memory(dst.mem->engine(), pd()->conv_pd_->diff_src_md(), - scratchpad.get_memory_storage(key_deconv_bias)); - memory_arg_t tmp_conv_output = {&tmp_memory, false}; + std::unique_ptr<memory_t> tmp_memory; + CHECK(safe_ptr_assign(tmp_memory, + new memory_t(dst.mem->engine(), pd()->conv_pd_->diff_src_md(), + scratchpad.get_memory_storage(key_deconv_bias)))); + memory_arg_t tmp_conv_output = {tmp_memory.get(), false}; conv_args[DNNL_ARG_DIFF_SRC] = ref_bias || non_default_attr ?
tmp_conv_output : dst; @@ -534,11 +530,10 @@ status_t ref_deconvolution_fwd_t::execute(const exec_ctx_t &ctx) const { float *conv_output = scratchpad.get<float>(key_deconv_bias); - const auto &arg_scales = pd()->attr()->scales_; - const auto &src_scales = arg_scales.get(DNNL_ARG_SRC); - const auto &wei_scales = arg_scales.get(DNNL_ARG_WEIGHTS); + const auto &scales = pd()->attr()->scales_; - if (!src_scales.has_default_values() || !wei_scales.has_default_values()) { + if (!scales.has_default_values(DNNL_ARG_SRC) + || !scales.has_default_values(DNNL_ARG_WEIGHTS)) { compute_oscale(ctx, conv_output); } @@ -599,8 +594,8 @@ void ref_deconvolution_bwd_weights_t::compute_bwd_bias( template <data_type_t dbia_type, data_type_t ddst_type> void ref_deconvolution_bwd_weights_t::compute_bwd_bias_ncdhw( - typename prec_traits<dbia_type>::type *diff_bias, - const typename prec_traits<ddst_type>::type *diff_dst) const { + typename prec_traits_t<dbia_type>::type *diff_bias, + const typename prec_traits_t<ddst_type>::type *diff_dst) const { const memory_desc_wrapper diff_dst_d(pd()->diff_dst_md()); const auto OC = pd()->OC(); @@ -622,8 +617,8 @@ void ref_deconvolution_bwd_weights_t::compute_bwd_bias_ncdhw( template <data_type_t dbia_type, data_type_t ddst_type> void ref_deconvolution_bwd_weights_t::compute_bwd_bias_ndhwc( - typename prec_traits<dbia_type>::type *diff_bias, - const typename prec_traits<ddst_type>::type *diff_dst) const { + typename prec_traits_t<dbia_type>::type *diff_bias, + const typename prec_traits_t<ddst_type>::type *diff_dst) const { const auto MB = pd()->MB(); const auto SP = pd()->OW() * pd()->OH() * pd()->OD(); const auto OC = pd()->OC(); @@ -637,14 +632,15 @@ void ref_deconvolution_bwd_weights_t::compute_bwd_bias_ndhwc( db += diff_dst[offset]; } } - diff_bias[oc] = static_cast<typename prec_traits<dbia_type>::type>(db); + diff_bias[oc] + = static_cast<typename prec_traits_t<dbia_type>::type>(db); }); } template <data_type_t dbia_type, data_type_t ddst_type> void ref_deconvolution_bwd_weights_t::compute_bwd_bias_nCdhwXc( - typename prec_traits<dbia_type>::type *diff_bias, - const typename prec_traits<ddst_type>::type *diff_dst) const { + typename prec_traits_t<dbia_type>::type *diff_bias, + const typename prec_traits_t<ddst_type>::type *diff_dst) const { const memory_desc_wrapper diff_dst_d(pd()->diff_dst_md()); const auto OC = pd()->OC(); @@ -677,8 +673,8 @@ void ref_deconvolution_bwd_weights_t::compute_bwd_bias_nCdhwXc( template <data_type_t dbia_type, data_type_t ddst_type> void ref_deconvolution_bwd_weights_t::compute_bias( const exec_ctx_t &ctx) const { - using dbia_data_t = typename prec_traits<dbia_type>::type; - using ddst_data_t = typename prec_traits<ddst_type>::type; + using dbia_data_t = typename prec_traits_t<dbia_type>::type; + using ddst_data_t = typename prec_traits_t<ddst_type>::type; auto diff_bias = CTX_OUT_MEM(dbia_data_t *, DNNL_ARG_DIFF_BIAS); auto diff_dst = CTX_IN_MEM(const ddst_data_t *, DNNL_ARG_DIFF_DST); diff --git a/src/cpu/ref_deconvolution.hpp b/src/cpu/ref_deconvolution.hpp index 05f88e54470..04e81e52399 100644 --- a/src/cpu/ref_deconvolution.hpp +++ b/src/cpu/ref_deconvolution.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2018-2024 Intel Corporation +* Copyright 2018-2025 Intel Corporation * Copyright 2022 Arm Ltd.
and affiliates * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -93,9 +93,7 @@ static status_t conv_descr_create(const deconvolution_desc_t *dd, struct ref_deconvolution_fwd_t : public primitive_t { struct pd_t : public cpu_deconvolution_fwd_pd_t { - pd_t(const deconvolution_desc_t *adesc, const primitive_attr_t *attr, - const deconvolution_fwd_pd_t *hint_fwd_pd) - : cpu_deconvolution_fwd_pd_t(adesc, attr, hint_fwd_pd) {} + using cpu_deconvolution_fwd_pd_t::cpu_deconvolution_fwd_pd_t; pd_t(const pd_t &other) : cpu_deconvolution_fwd_pd_t(other) @@ -104,8 +102,6 @@ struct ref_deconvolution_fwd_t : public primitive_t { , dst_tag_(other.dst_tag_) , name_(other.name_) {} - ~pd_t() = default; - DECLARE_COMMON_PD_T(name_.c_str(), ref_deconvolution_fwd_t); status_t init_convolution(engine_t *engine) { @@ -167,14 +163,23 @@ struct ref_deconvolution_fwd_t : public primitive_t { using smask_t = primitive_attr_t::skip_mask_t; auto skip_mask = smask_t::post_ops | smask_t::sum_dt; if (utils::one_of(desc()->src_desc.data_type, s8, u8)) - skip_mask |= smask_t::scales_runtime - | smask_t::zero_points_runtime; + skip_mask |= smask_t::scales | smask_t::zero_points; VDISPATCH_DECONVOLUTION(is_fwd(), VERBOSE_BAD_PROPKIND); VDISPATCH_DECONVOLUTION(utils::one_of(desc()->alg_kind, alg_kind::deconvolution_direct, alg_kind::deconvolution_winograd), VERBOSE_BAD_ALGORITHM); + // This implementation will check data types requirements through + // an underlying convolution implementation, however, convolution + // might be called without bias, thus, need to check bias data type + // if it was requested. + if (with_bias()) { + const auto bia_type = invariant_wei_md(1)->data_type; + VDISPATCH_DECONVOLUTION(utils::one_of(bia_type, f32, bf16, f16, + f8_e5m2, f8_e4m3), + VERBOSE_UNSUPPORTED_DT); + } VDISPATCH_DECONVOLUTION(attr()->has_default_values(skip_mask), VERBOSE_UNSUPPORTED_ATTR); VDISPATCH_DECONVOLUTION( @@ -256,16 +261,25 @@ struct ref_deconvolution_fwd_t : public primitive_t { } bool zero_points_ok() const { + const auto &zp = attr()->zero_points_; + using namespace data_type; - int mask_src = 0, mask_dst = 0; - attr()->zero_points_.get(DNNL_ARG_SRC, &mask_src); - attr()->zero_points_.get(DNNL_ARG_DST, &mask_dst); - - return IMPLICATION(!utils::one_of(src_md()->data_type, s8, u8), - attr()->zero_points_.has_default_values()) - && attr()->zero_points_.has_default_values(DNNL_ARG_WEIGHTS) - && (mask_src == 0 || mask_src == 1 << 1) - && (mask_dst == 0 || mask_dst == 1 << 1); + bool ok = IMPLICATION(!utils::one_of(src_md()->data_type, s8, u8), + zp.has_default_values()); + if (!ok) return false; + + if (!zp.has_default_values(DNNL_ARG_SRC)) { + int mask_src = zp.get_mask(DNNL_ARG_SRC); + ok = utils::one_of(mask_src, 0, (1 << 1)); + if (!ok) return false; + } + if (!zp.has_default_values(DNNL_ARG_DST)) { + int mask_dst = zp.get_mask(DNNL_ARG_DST); + ok = utils::one_of(mask_dst, 0, (1 << 1)); + if (!ok) return false; + } + + return zp.has_default_values(DNNL_ARG_WEIGHTS); } }; @@ -312,17 +326,13 @@ struct ref_deconvolution_fwd_t : public primitive_t { struct ref_deconvolution_bwd_data_t : public primitive_t { struct pd_t : public cpu_deconvolution_bwd_data_pd_t { - pd_t(const deconvolution_desc_t *adesc, const primitive_attr_t *attr, - const deconvolution_fwd_pd_t *hint_fwd_pd) - : cpu_deconvolution_bwd_data_pd_t(adesc, attr, hint_fwd_pd) {} + using cpu_deconvolution_bwd_data_pd_t::cpu_deconvolution_bwd_data_pd_t; pd_t(const pd_t &other) : cpu_deconvolution_bwd_data_pd_t(other) , 
conv_pd_(other.conv_pd_->clone()) , name_(other.name_) {} - ~pd_t() = default; - DECLARE_COMMON_PD_T(name_.c_str(), ref_deconvolution_bwd_data_t); status_t init_convolution(engine_t *engine) { @@ -357,7 +367,9 @@ struct ref_deconvolution_bwd_data_t : public primitive_t { VERBOSE_BAD_PROPKIND); VDISPATCH_DECONVOLUTION(utils::one_of(wei_type, f32, bf16, f16), VERBOSE_UNSUPPORTED_DT); - VDISPATCH_DECONVOLUTION(ddst_type == wei_type, + VDISPATCH_DECONVOLUTION(IMPLICATION(ddst_type != wei_type, + utils::one_of(wei_type, bf16, f16) + && ddst_type == f32), VERBOSE_INCONSISTENT_DT, "diff_dst", "weights"); VDISPATCH_DECONVOLUTION(utils::one_of(dsrc_type, wei_type, f32), VERBOSE_UNSUPPORTED_DT); @@ -396,7 +408,7 @@ struct ref_deconvolution_bwd_data_t : public primitive_t { } }; - typedef typename prec_traits<data_type::f32>::type data_t; + using data_t = typename prec_traits_t<data_type::f32>::type; ref_deconvolution_bwd_data_t(const pd_t *apd) : primitive_t(apd) {} return pd()->conv_pd_->create_primitive(conv_p_, engine); } -#if DNNL_AARCH64 && DNNL_AARCH64_USE_ACL +#if DNNL_USE_ACL status_t create_resource( engine_t *engine, resource_mapper_t &mapper) const override { CHECK(conv_p_->create_resource(engine, mapper)); @@ -421,9 +433,8 @@ struct ref_deconvolution_bwd_data_t : public primitive_t { struct ref_deconvolution_bwd_weights_t : public primitive_t { struct pd_t : public cpu_deconvolution_bwd_weights_pd_t { - pd_t(const deconvolution_desc_t *adesc, const primitive_attr_t *attr, - const deconvolution_fwd_pd_t *hint_fwd_pd) - : cpu_deconvolution_bwd_weights_pd_t(adesc, attr, hint_fwd_pd) {} + using cpu_deconvolution_bwd_weights_pd_t:: + cpu_deconvolution_bwd_weights_pd_t; pd_t(const pd_t &other) : cpu_deconvolution_bwd_weights_pd_t(other) , dst_tag_(other.dst_tag_) , name_(other.name_) {} - ~pd_t() = default; - DECLARE_COMMON_PD_T(name_.c_str(), ref_deconvolution_bwd_weights_t); status_t init_convolution(engine_t *engine) { @@ -469,18 +478,31 @@ struct ref_deconvolution_bwd_weights_t : public primitive_t { status_t init(engine_t *engine) { using namespace format_tag; using namespace data_type; - auto src_type = desc()->src_desc.data_type; - auto dwei_type = desc()->diff_weights_desc.data_type; - auto ddst_type = desc()->diff_dst_desc.data_type; + auto src_type = invariant_src_md()->data_type; + auto wei_type = invariant_wei_md(0)->data_type; + auto dst_type = invariant_dst_md()->data_type; VDISPATCH_DECONVOLUTION( desc()->prop_kind == prop_kind::backward_weights, VERBOSE_BAD_PROPKIND); VDISPATCH_DECONVOLUTION(utils::one_of(src_type, f32, bf16, f16), VERBOSE_UNSUPPORTED_DT); - VDISPATCH_DECONVOLUTION(ddst_type == src_type, + VDISPATCH_DECONVOLUTION(dst_type == src_type, VERBOSE_INCONSISTENT_DT, "diff_dst", "src"); - VDISPATCH_DECONVOLUTION(utils::one_of(dwei_type, src_type, f32), + VDISPATCH_DECONVOLUTION(utils::one_of(wei_type, src_type, f32), VERBOSE_UNSUPPORTED_DT); + // This implementation will check data types requirements through + // an underlying convolution implementation, however, convolution + // might be called without bias, thus, need to check bias data type + // if it was requested.
+ if (with_bias()) { + const auto bia_type = invariant_wei_md(1)->data_type; + VDISPATCH_DECONVOLUTION(utils::one_of(bia_type, f32, bf16, f16) + && (bia_type == dst_type + || (bia_type == f32 + && utils::one_of( + dst_type, bf16, f16))), + VERBOSE_UNSUPPORTED_DT); + } VDISPATCH_DECONVOLUTION(utils::one_of(desc()->alg_kind, alg_kind::deconvolution_direct, alg_kind::deconvolution_winograd), @@ -539,18 +561,18 @@ struct ref_deconvolution_bwd_weights_t : public primitive_t { template <data_type_t dbia_type, data_type_t ddst_type> void compute_bwd_bias_ncdhw( - typename prec_traits<dbia_type>::type *diff_bias, - const typename prec_traits<ddst_type>::type *diff_dst) const; + typename prec_traits_t<dbia_type>::type *diff_bias, + const typename prec_traits_t<ddst_type>::type *diff_dst) const; template <data_type_t dbia_type, data_type_t ddst_type> void compute_bwd_bias_ndhwc( - typename prec_traits<dbia_type>::type *diff_bias, - const typename prec_traits<ddst_type>::type *diff_dst) const; + typename prec_traits_t<dbia_type>::type *diff_bias, + const typename prec_traits_t<ddst_type>::type *diff_dst) const; template <data_type_t dbia_type, data_type_t ddst_type> void compute_bwd_bias_nCdhwXc( - typename prec_traits<dbia_type>::type *diff_bias, - const typename prec_traits<ddst_type>::type *diff_dst) const; + typename prec_traits_t<dbia_type>::type *diff_bias, + const typename prec_traits_t<ddst_type>::type *diff_dst) const; template <data_type_t dbia_type, data_type_t ddst_type> void compute_bias(const exec_ctx_t &ctx) const; diff --git a/src/cpu/ref_depthwise_injector.cpp b/src/cpu/ref_depthwise_injector.cpp new file mode 100644 index 00000000000..585b661324a --- /dev/null +++ b/src/cpu/ref_depthwise_injector.cpp @@ -0,0 +1,78 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#include "ref_depthwise_injector.hpp" + +namespace dnnl { +namespace impl { +namespace cpu { + +using namespace alg_kind; + +template <typename T> inline T scale_shift_fwd(T s_val, T w_val, T b_val) { + return s_val*w_val + b_val; +} + +template <typename T> inline T prelu_fwd(T s_val, T w_val) { + return s_val >= 0 ? s_val : s_val*w_val; +} + +union float_raw { + float f; + unsigned short i[2]; +}; + +static float bf16tof32(bfloat16_t bf16) { + union float_raw t = { 0 }; + t.i[1] = bf16; + t.i[0] = 0; + return t.f; +} + +static bfloat16_t f32tobf16(float f32) { + union float_raw t = { 0 }; + t.f = f32; + return t.i[1]; +} + +inline bfloat16_t bf16_scale_shift_fwd(bfloat16_t s_val, bfloat16_t w_val, bfloat16_t b_val) { + return f32tobf16(bf16tof32(s_val) * bf16tof32(w_val) + bf16tof32(b_val)); +} + +inline bfloat16_t bf16_prelu_fwd(bfloat16_t s_val, bfloat16_t w_val) { + return s_val >= 0 ?
s_val : f32tobf16(bf16tof32(s_val) * bf16tof32(w_val)); +} + +ref_depthwise_scalar_fwd_t::ref_depthwise_scalar_fwd_t(const alg_kind_t alg_) + : alg(alg_) { + using namespace alg_kind; + + assert(utils::one_of(alg, depthwise_scale_shift, depthwise_prelu)); +} + +float ref_depthwise_scalar_fwd_t::compute_scalar(float s, const float* weights, const float* bias) const { + switch (alg) { + case depthwise_scale_shift: return scale_shift_fwd(s, *weights, *bias); + case depthwise_prelu: return prelu_fwd(s, *weights); + default: assert(!"unknown depthwise alg_kind"); + } + + return 0.0f; +} + +} // namespace cpu +} // namespace impl +} // namespace dnnl diff --git a/src/cpu/ref_depthwise_injector.hpp b/src/cpu/ref_depthwise_injector.hpp new file mode 100644 index 00000000000..1a56e28cdc2 --- /dev/null +++ b/src/cpu/ref_depthwise_injector.hpp @@ -0,0 +1,40 @@ +/******************************************************************************* +* Copyright 2020 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#ifndef REF_DEPTHWISE_INJECTOR_HPP +#define REF_DEPTHWISE_INJECTOR_HPP + +#include "common/primitive.hpp" +#include "common/primitive_attr.hpp" + +namespace dnnl { +namespace impl { +namespace cpu { + +struct ref_depthwise_scalar_fwd_t { +public: + explicit ref_depthwise_scalar_fwd_t(alg_kind_t alg); + float compute_scalar(float s, const float* weights, const float* bias) const; + +private: + alg_kind_t alg; +}; + +} // namespace cpu +} // namespace impl +} // namespace dnnl + +#endif diff --git a/src/cpu/ref_eltwise.hpp b/src/cpu/ref_eltwise.hpp index df0724e40b9..2adaa11c32c 100644 --- a/src/cpu/ref_eltwise.hpp +++ b/src/cpu/ref_eltwise.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2016-2024 Intel Corporation +* Copyright 2016-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -96,7 +96,7 @@ struct ref_eltwise_fwd_t : public primitive_t { return status::success; } - using data_t = typename prec_traits<data_type>::type; + using data_t = typename prec_traits_t<data_type>::type; status_t execute(const exec_ctx_t &ctx) const override { if (pd()->use_dense_) @@ -172,7 +172,7 @@ struct ref_eltwise_bwd_t : public primitive_t { }; ref_eltwise_bwd_t(const pd_t *apd) : primitive_t(apd) {} - typedef typename prec_traits<data_type>::type data_t; + using data_t = typename prec_traits_t<data_type>::type; status_t execute(const exec_ctx_t &ctx) const override { if (pd()->use_dense_) diff --git a/src/cpu/ref_fused_convolution.hpp b/src/cpu/ref_fused_convolution.hpp index 5fa764fcf3b..c01e2d1d008 100644 --- a/src/cpu/ref_fused_convolution.hpp +++ b/src/cpu/ref_fused_convolution.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2020-2023 Intel Corporation +* Copyright 2020-2025 Intel Corporation * Copyright 2022 Arm Ltd.
and affiliates * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -74,23 +74,17 @@ struct ref_fused_convolution_fwd_t : public primitive_t { }; struct pd_t : public cpu_convolution_fwd_pd_t { - pd_t(const convolution_desc_t *adesc, const primitive_attr_t *attr, - const typename pd_t::base_class *hint_fwd_pd) - : cpu_convolution_fwd_pd_t(adesc, attr, hint_fwd_pd) { - name_ = "ref_fused_convolution:any"; - } - - pd_t(const pd_t &other) = default; + using cpu_convolution_fwd_pd_t::cpu_convolution_fwd_pd_t; DECLARE_COMMON_PD_T(name_.c_str(), ref_fused_convolution_fwd_t); virtual status_t init(engine_t *engine) { using namespace primitive_kind; - bool ok = true && is_fwd() - && attr()->post_ops_.has_default_values( - {binary, eltwise, convolution}); - if (!ok) return status::unimplemented; + VDISPATCH_CONV(is_fwd(), VERBOSE_BAD_PROPKIND); + VDISPATCH_CONV(attr()->post_ops_.has_default_values( + {binary, eltwise, convolution}), + VERBOSE_UNSUPPORTED_ATTR); CHECK(init_ops(engine)); init_name(); @@ -99,21 +93,29 @@ struct ref_fused_convolution_fwd_t : public primitive_t { const memory_desc_t *src_md( int index = 0, bool user_input = false) const override { + if (op_pds_.empty()) + return cpu_convolution_fwd_pd_t::src_md(index, user_input); return op_pds_.front()->src_md(index, user_input); } const memory_desc_t *dst_md( int index = 0, bool user_input = false) const override { + if (op_pds_.empty()) + return cpu_convolution_fwd_pd_t::dst_md(index, user_input); return op_pds_.back()->dst_md(index, user_input); } const memory_desc_t *weights_md( int index = 0, bool user_input = false) const override { + if (op_pds_.empty()) + return cpu_convolution_fwd_pd_t::weights_md(index, user_input); return op_pds_.front()->weights_md(index, user_input); // for now } const memory_desc_t *arg_md( int arg, bool user_input = false) const override { + if (op_pds_.empty()) + return cpu_convolution_fwd_pd_t::arg_md(arg, user_input); // Binary post-op: // format_tag::any should be supported here since output dst_md // may be different from the intermediate one and they should be @@ -157,9 +159,9 @@ struct ref_fused_convolution_fwd_t : public primitive_t { if (arg == (DNNL_ARG_ATTR_POST_OP_DW | DNNL_ARG_WEIGHTS)) return arg_usage_t::input; - if (arg == (DNNL_ARG_ATTR_POST_OP_DW | DNNL_ARG_BIAS) - && attr_post_op_dw_inputs() > 1) - return arg_usage_t::input; + if (arg == (DNNL_ARG_ATTR_POST_OP_DW | DNNL_ARG_BIAS)) + return attr_post_op_dw_inputs() > 1 ? 
arg_usage_t::input + : arg_usage_t::unused; if (arg == (DNNL_ARG_ATTR_POST_OP_DW | DNNL_ARG_SRC)) return arg_usage_t::input; @@ -175,7 +177,7 @@ struct ref_fused_convolution_fwd_t : public primitive_t { std::vector<arg_cache_t> args_; private: - std::string name_; + std::string name_ = "ref_fused_convolution:any"; const unsigned int max_fusions_ = 1; status_t append_op(std::shared_ptr<primitive_desc_t> &op_pd, @@ -222,10 +224,10 @@ struct ref_fused_convolution_fwd_t : public primitive_t { primitive_attr_t attr_1x1(*attr()); // erase dw_conv post-op scales for (auto arg : {DNNL_ARG_SRC, DNNL_ARG_WEIGHTS, DNNL_ARG_DST}) { - auto &scale - = attr_1x1.scales_.get(DNNL_ARG_ATTR_POST_OP_DW | arg); - if (!scale.has_default_values()) - attr_1x1.scales_.reset(DNNL_ARG_ATTR_POST_OP_DW | arg); + if (!attr_1x1.scales_.has_default_values( + DNNL_ARG_ATTR_POST_OP_DW | arg)) + CHECK(attr_1x1.scales_.set(DNNL_ARG_ATTR_POST_OP_DW | arg, + default_quant_entry())); } // erase post-ops after fusion as they will be handled separately auto &e = attr_1x1.post_ops_.entry_; @@ -248,7 +250,7 @@ struct ref_fused_convolution_fwd_t : public primitive_t { arg_cache.append_ctx_arg(DNNL_ARG_SRC); arg_cache.append_ctx_arg(DNNL_ARG_WEIGHTS); for (auto arg : {DNNL_ARG_SRC, DNNL_ARG_WEIGHTS, DNNL_ARG_DST}) - if (!attr_1x1.scales_.get(arg).has_default_values()) + if (!attr_1x1.scales_.has_default_values(arg)) arg_cache.append_ctx_arg(DNNL_ARG_ATTR_SCALES | arg); if (desc()->bias_desc.data_type != data_type::undef) arg_cache.append_ctx_arg(DNNL_ARG_BIAS); @@ -314,12 +316,12 @@ struct ref_fused_convolution_fwd_t : public primitive_t { arg_cache.append_ctx_arg(DNNL_ARG_WEIGHTS, DNNL_ARG_ATTR_POST_OP_DW | DNNL_ARG_WEIGHTS); for (auto arg : {DNNL_ARG_WEIGHTS, DNNL_ARG_DST}) - if (!attr_dw.scales_.get(arg).has_default_values()) + if (!attr_dw.scales_.has_default_values(arg)) arg_cache.append_ctx_arg(DNNL_ARG_ATTR_SCALES | arg, DNNL_ARG_ATTR_POST_OP_DW | DNNL_ARG_ATTR_SCALES | arg); // dw_conv src_scale = 1x1_conv dst_scale - if (!attr_1x1.scales_.get(DNNL_ARG_DST).has_default_values()) + if (!attr_1x1.scales_.has_default_values(DNNL_ARG_DST)) arg_cache.append_ctx_arg( DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC, DNNL_ARG_ATTR_SCALES | DNNL_ARG_DST); @@ -387,7 +389,7 @@ struct ref_fused_convolution_fwd_t : public primitive_t { return status::success; } -#if DNNL_AARCH64 && DNNL_AARCH64_USE_ACL +#if DNNL_USE_ACL status_t create_resource( engine_t *engine, resource_mapper_t &mapper) const override { for (auto &p : primitives_) { @@ -406,7 +408,7 @@ struct ref_fused_convolution_fwd_t : public primitive_t { const auto &ctx_args = ctx.args(); const auto op_count = primitives_.size(); - std::vector> inout_memory; + std::vector> inout_memory; for (size_t i = 0; i < op_count; ++i) { const auto &op = primitives_[i]; diff --git a/src/cpu/ref_group_normalization.hpp b/src/cpu/ref_group_normalization.hpp index 6a4e0aba676..86397164f98 100644 --- a/src/cpu/ref_group_normalization.hpp +++ b/src/cpu/ref_group_normalization.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2023-2024 Intel Corporation +* Copyright 2023-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License.
@@ -50,9 +50,8 @@ struct ref_group_normalization_fwd_t : public primitive_t { && platform::has_data_type_support( dst_md()->data_type), VERBOSE_UNSUPPORTED_DT); - VDISPATCH_GNORM( - attr()->has_default_values(skip_mask_t::scales_runtime - | skip_mask_t::post_ops), + VDISPATCH_GNORM(attr()->has_default_values(skip_mask_t::scales + | skip_mask_t::post_ops), VERBOSE_UNSUPPORTED_ATTR); VDISPATCH_GNORM(attr_scales_ok(), VERBOSE_UNSUPPORTED_SCALES_CFG); VDISPATCH_GNORM(post_ops_ok(), VERBOSE_UNSUPPORTED_POSTOP); diff --git a/src/cpu/ref_inner_product.hpp b/src/cpu/ref_inner_product.hpp index 98f8df93557..042be1fb7ed 100644 --- a/src/cpu/ref_inner_product.hpp +++ b/src/cpu/ref_inner_product.hpp @@ -57,7 +57,8 @@ struct ref_inner_product_fwd_t : public primitive_t { VERBOSE_UNSUPPORTED_DT); VDISPATCH_INNER_PRODUCT(platform::has_data_type_support(dst_type), VERBOSE_UNSUPPORTED_DT); - VDISPATCH_INNER_PRODUCT(utils::one_of(src_type, f32, bf16, f16), + VDISPATCH_INNER_PRODUCT( + utils::one_of(src_type, f32, bf16, f16, f8_e5m2, f8_e4m3), VERBOSE_UNSUPPORTED_DT); VDISPATCH_INNER_PRODUCT(wei_type == src_type, VERBOSE_INCONSISTENT_DT, "weights", "src"); diff --git a/src/cpu/ref_inner_product_int8.cpp b/src/cpu/ref_inner_product_int8.cpp index 91198c680ab..322f39da638 100644 --- a/src/cpu/ref_inner_product_int8.cpp +++ b/src/cpu/ref_inner_product_int8.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2021-2023 Intel Corporation +* Copyright 2021-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -76,13 +76,12 @@ status_t ref_inner_product_int8_fwd_t::execute_forward( DEFINE_ARG_SCALES_BUFFER(dst_scales, DNNL_ARG_DST); const auto &attr_scales = pd()->attr()->scales_; - const bool with_dst_scales - = !attr_scales.get(DNNL_ARG_DST).has_default_values(); + const bool with_dst_scales = !attr_scales.has_default_values(DNNL_ARG_DST); auto maybe_oscale = [&](float &d, dim_t oc) { // scale_idx_mult = 1 for per_oc scales and 0, otherwise const int scale_idx_mult - = attr_scales.get(DNNL_ARG_WEIGHTS).mask_ == (1 << 0); + = attr_scales.get_mask(DNNL_ARG_WEIGHTS) == (1 << 0); d *= src_scales[0] * wei_scales[oc * scale_idx_mult]; }; diff --git a/src/cpu/ref_inner_product_int8.hpp b/src/cpu/ref_inner_product_int8.hpp index f905715803f..4f16d2e368d 100644 --- a/src/cpu/ref_inner_product_int8.hpp +++ b/src/cpu/ref_inner_product_int8.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2021-2024 Intel Corporation +* Copyright 2021-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -68,7 +68,7 @@ struct ref_inner_product_int8_fwd_t : public primitive_t { set_default_params(allow_all_tags) == status::success, VERBOSE_UNSUPPORTED_TAG); VDISPATCH_INNER_PRODUCT( - attr()->has_default_values(smask_t::scales_runtime + attr()->has_default_values(smask_t::scales | smask_t::post_ops | smask_t::sum_dt), VERBOSE_UNSUPPORTED_ATTR); VDISPATCH_INNER_PRODUCT( diff --git a/src/cpu/ref_io_helper.hpp b/src/cpu/ref_io_helper.hpp index fc5ddb22998..046425bc044 100644 --- a/src/cpu/ref_io_helper.hpp +++ b/src/cpu/ref_io_helper.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2021-2024 Intel Corporation +* Copyright 2021-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -35,7 +35,7 @@ inline int load_int_value(data_type_t dt, const void *ptr, dim_t idx) { #define CASE(dt) \ case dt: \ return static_cast<int>( \ - reinterpret_cast<const typename prec_traits<dt>::type *>( \ + reinterpret_cast<const typename prec_traits_t<dt>::type *>( \ ptr)[idx]); using namespace data_type; @@ -44,17 +44,15 @@ inline int load_int_value(data_type_t dt, const void *ptr, dim_t idx) { CASE(s8); CASE(u8); case s4: { - const auto shift = idx % 2 ? int4_extract_t::high_half - : int4_extract_t::low_half; - auto val = int4_t::extract( - reinterpret_cast<const uint8_t *>(ptr)[idx / 2], shift); + const nibble2_t nibble_pair( + reinterpret_cast<const uint8_t *>(ptr)[idx / 2]); + int4_t val(nibble_pair.get(idx % 2)); return static_cast<int>(val); } case u4: { - const auto shift = idx % 2 ? int4_extract_t::high_half - : int4_extract_t::low_half; - auto val = uint4_t::extract( - reinterpret_cast<const uint8_t *>(ptr)[idx / 2], shift); + const nibble2_t nibble_pair( + reinterpret_cast<const uint8_t *>(ptr)[idx / 2]); + uint4_t val(nibble_pair.get(idx % 2)); return static_cast<int>(val); } default: assert(!"bad data_type"); @@ -64,12 +62,12 @@ inline int load_int_value(data_type_t dt, const void *ptr, dim_t idx) { return INT_MAX; } -inline float load_float_value(data_type_t dt, const void *ptr, dim_t idx) { +FORCE_INLINE float load_float_value(data_type_t dt, const void *ptr, dim_t idx) { assert(ptr); #define CASE(dt) \ case dt: \ return static_cast<float>( \ - reinterpret_cast<const typename prec_traits<dt>::type *>( \ + reinterpret_cast<const typename prec_traits_t<dt>::type *>( \ ptr)[idx]); using namespace data_type; @@ -84,17 +82,27 @@ inline float load_float_value(data_type_t dt, const void *ptr, dim_t idx) { CASE(u8); CASE(e8m0); case s4: { - const auto shift = idx % 2 ? int4_extract_t::high_half - : int4_extract_t::low_half; - auto val = int4_t::extract( - reinterpret_cast<const uint8_t *>(ptr)[idx / 2], shift); + const nibble2_t nibble_pair( + static_cast<const uint8_t *>(ptr)[idx / 2]); + int4_t val(nibble_pair.get(idx % 2)); return static_cast<float>(val); } case u4: { - const auto shift = idx % 2 ?
int4_extract_t::high_half - : int4_extract_t::low_half; - auto val = uint4_t::extract( - reinterpret_cast<const uint8_t *>(ptr)[idx / 2], shift); + const nibble2_t nibble_pair( + static_cast<const uint8_t *>(ptr)[idx / 2]); + uint4_t val(nibble_pair.get(idx % 2)); + return static_cast<float>(val); + } + case f4_e2m1: { + const nibble2_t nibble_pair + = reinterpret_cast<const nibble2_t *>(ptr)[idx / 2]; + float4_e2m1_t val(nibble_pair.get(idx % 2), true); + return static_cast<float>(val); + } + case f4_e3m0: { + const nibble2_t nibble_pair + = reinterpret_cast<const nibble2_t *>(ptr)[idx / 2]; + float4_e3m0_t val(nibble_pair.get(idx % 2), true); return static_cast<float>(val); } default: assert(!"bad data_type"); @@ -108,7 +116,7 @@ inline void store_float_value(data_type_t dt, float val, void *ptr, dim_t idx) { assert(ptr); #define CASE(dt) \ case dt: { \ - using type_ = typename prec_traits<dt>
::type; \ + using type_ = typename prec_traits_t<dt>
::type; \ *(reinterpret_cast<type_ *>(ptr) + idx) \ = cpu::q10n::saturate_and_round<type_>(val); \ } break; @@ -123,6 +131,22 @@ inline void store_float_value(data_type_t dt, float val, void *ptr, dim_t idx) { CASE(s32); CASE(s8); CASE(u8); + case f4_e2m1: { + auto dst_ = reinterpret_cast<nibble2_t *>(ptr); + nibble2_t nibble_pair = dst_[idx / 2]; + float4_e2m1_t f4_val(val); + nibble_pair.set(f4_val.raw_bits_, idx % 2); + dst_[idx / 2] = nibble_pair; + break; + } + case f4_e3m0: { + auto dst_ = reinterpret_cast<nibble2_t *>(ptr); + nibble2_t nibble_pair = dst_[idx / 2]; + float4_e3m0_t f4_val(val); + nibble_pair.set(f4_val.raw_bits_, idx % 2); + dst_[idx / 2] = nibble_pair; + break; + } default: assert(!"bad data_type"); } diff --git a/src/cpu/ref_layer_normalization.hpp b/src/cpu/ref_layer_normalization.hpp index e6865cb7546..20cf4eb0fde 100644 --- a/src/cpu/ref_layer_normalization.hpp +++ b/src/cpu/ref_layer_normalization.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2024 Intel Corporation +* Copyright 2019-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -63,9 +63,8 @@ struct ref_layer_normalization_fwd_t : public primitive_t { VDISPATCH_LNORM(check_scale_shift_data_type({f32, bf16, f16}), VERBOSE_UNSUPPORTED_FEATURE, "unsupported scale or shift data type"); - VDISPATCH_LNORM( - attr()->has_default_values(skip_mask_t::scales_runtime - | skip_mask_t::post_ops), + VDISPATCH_LNORM(attr()->has_default_values(skip_mask_t::scales + | skip_mask_t::post_ops), VERBOSE_UNSUPPORTED_ATTR); VDISPATCH_LNORM(attr_scales_ok(), VERBOSE_UNSUPPORTED_SCALES_CFG); VDISPATCH_LNORM(post_ops_ok(), VERBOSE_UNSUPPORTED_POSTOP); diff --git a/src/cpu/ref_lrn.hpp b/src/cpu/ref_lrn.hpp index 85dfa0c9b97..6fe97419eb6 100644 --- a/src/cpu/ref_lrn.hpp +++ b/src/cpu/ref_lrn.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2016-2024 Intel Corporation +* Copyright 2016-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -69,7 +69,7 @@ struct ref_lrn_fwd_t : public primitive_t { }; ref_lrn_fwd_t(const pd_t *apd) : primitive_t(apd) {} - typedef typename prec_traits<d_type>::type data_t; + using data_t = typename prec_traits_t<d_type>::type; status_t execute(const exec_ctx_t &ctx) const override { using namespace format_tag; @@ -127,7 +127,7 @@ struct ref_lrn_bwd_t : public primitive_t { }; ref_lrn_bwd_t(const pd_t *apd) : primitive_t(apd) {} - typedef typename prec_traits<d_type>::type data_t; + using data_t = typename prec_traits_t<d_type>::type; status_t execute(const exec_ctx_t &ctx) const override { using namespace format_tag; diff --git a/src/cpu/ref_pooling.cpp b/src/cpu/ref_pooling.cpp index 00dcb566860..6c3368f0aad 100644 --- a/src/cpu/ref_pooling.cpp +++ b/src/cpu/ref_pooling.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2016-2024 Intel Corporation +* Copyright 2016-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License.
@@ -43,13 +43,13 @@ static inline dim_t get_offset(const memory_desc_wrapper &mdw, dim_t n, dim_t c, using namespace nstl; -template <data_type_t data_type, data_type_t acc_type> -status_t ref_pooling_fwd_t<data_type, acc_type>::execute_forward( +template <data_type_t src_type, data_type_t dst_type, data_type_t acc_type> +status_t ref_pooling_fwd_t<src_type, dst_type, acc_type>::execute_forward( const exec_ctx_t &ctx) const { status_t status = status::success; - auto src = CTX_IN_MEM(const data_t *, DNNL_ARG_SRC); - auto dst = CTX_OUT_CLEAN_MEM(data_t *, DNNL_ARG_DST, status); + auto src = CTX_IN_MEM(const src_data_t *, DNNL_ARG_SRC); + auto dst = CTX_OUT_CLEAN_MEM(dst_data_t *, DNNL_ARG_DST, status); CHECK(status); auto ws = CTX_OUT_CLEAN_MEM(unsigned char *, DNNL_ARG_WORKSPACE, status); CHECK(status); @@ -89,7 +89,7 @@ status_t ref_pooling_fwd_t<data_type, acc_type>::execute_forward( const auto off = get_offset(ws_d, mb, oc, od, oh, ow); if (ws_dt == data_type::u8) { assert(0 <= value - && value <= numeric_limits<typename prec_traits<data_type::u8>::type>::max()); + && value <= numeric_limits<typename prec_traits_t<data_type::u8>::type>::max()); ws[off] = value; } else @@ -167,12 +167,39 @@ status_t ref_pooling_fwd_t<data_type, acc_type>::execute_forward( * (KW - iw_start_excluded - iw_end_excluded); } d /= num_summands; + + const auto &p = pd()->attr()->post_ops_; + for (int i = 0; i < p.len(); i++) { + auto &post_op = p.entry_[i]; + if (post_op.is_quantization()) { + auto quant = post_op.quantization; + auto quantization_base = CTX_IN_MEM(const float *, (DNNL_ARG_ATTR_MULTIPLE_POST_OP(i) | DNNL_ARG_SRC_1)); + const auto crop_low_data = quantization_base + quant.offset[quant.crop_low]; + const auto crop_high_data = quantization_base + quant.offset[quant.crop_high]; + const auto inp_scale_data = quantization_base + quant.offset[quant.inp_scale]; + const auto inp_shift_data = quantization_base + quant.offset[quant.inp_shift]; + const auto output_scale_data = quantization_base + quant.offset[quant.output_scale]; + const auto output_shift_data = quantization_base + quant.offset[quant.output_shift]; + + float cl = crop_low_data[!quant.per_channel[quant.crop_low] ? 0 : oc]; + float ch = crop_high_data[!quant.per_channel[quant.crop_high] ? 0 : oc]; + float isc = inp_scale_data[!quant.per_channel[quant.inp_scale] ? 0 : oc]; + float ish = inp_shift_data[!quant.per_channel[quant.inp_shift] ? 0 : oc]; + float osc = output_scale_data[!quant.per_channel[quant.output_scale] ? 0 : oc]; + float osh = output_shift_data[!quant.per_channel[quant.output_shift] ? 0 : oc]; + + d = nstl::min(ch, nstl::max(cl, d)); + d = d * isc + ish; + d = roundf(d); + d = d * osc + osh; + } + } }; const bool is_max_pool = alg == alg_kind::pooling_max; float base_res - = is_max_pool ? (float)numeric_limits<data_t>::lowest() : 0.f; + = is_max_pool ? (float)numeric_limits<src_data_t>::lowest() : 0.f; using ker_t = std::function<void(float &, dim_t, dim_t, dim_t, dim_t, dim_t)>; ker_t kernel = is_max_pool ?
(ker_t)ker_max : (ker_t)ker_avg; @@ -191,7 +218,7 @@ status_t ref_pooling_fwd_t::execute_forward( args.dst_md = pd()->dst_md(); ref_post_ops->execute(res, args); - dst[data_p_off] = cpu::q10n::saturate_and_round<data_t>(res); + dst[data_p_off] = cpu::q10n::saturate_and_round<dst_data_t>(res); }); return status::success; @@ -371,14 +398,16 @@ status_t ref_pooling_bwd_t::execute(const exec_ctx_t &ctx) const { return status::success; } -template struct ref_pooling_fwd_t; -template struct ref_pooling_fwd_t; -template struct ref_pooling_fwd_t; -template struct ref_pooling_fwd_t; -template struct ref_pooling_fwd_t; -template struct ref_pooling_fwd_t; -template struct ref_pooling_fwd_t; -template struct ref_pooling_fwd_t; +template struct ref_pooling_fwd_t; +template struct ref_pooling_fwd_t; +template struct ref_pooling_fwd_t; +template struct ref_pooling_fwd_t; +template struct ref_pooling_fwd_t; +template struct ref_pooling_fwd_t; +template struct ref_pooling_fwd_t; +template struct ref_pooling_fwd_t; +template struct ref_pooling_fwd_t; +template struct ref_pooling_fwd_t; } // namespace cpu } // namespace impl diff --git a/src/cpu/ref_pooling.hpp b/src/cpu/ref_pooling.hpp index d6e89f5b195..c11de0703b5 100644 --- a/src/cpu/ref_pooling.hpp +++ b/src/cpu/ref_pooling.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2016-2024 Intel Corporation +* Copyright 2016-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -33,7 +33,7 @@ namespace dnnl { namespace impl { namespace cpu { -template <data_type_t data_type, data_type_t acc_type = data_type> +template <data_type_t src_type, data_type_t dst_type, data_type_t acc_type = dst_type> struct ref_pooling_fwd_t : public primitive_t { struct pd_t : public cpu_pooling_fwd_pd_t { using cpu_pooling_fwd_pd_t::cpu_pooling_fwd_pd_t; @@ -43,23 +43,29 @@ struct ref_pooling_fwd_t : public primitive_t { status_t init(engine_t *engine) { using sm = primitive_attr_t::skip_mask_t; - VDISPATCH_POOLING(platform::has_data_type_support(data_type), + VDISPATCH_POOLING(platform::has_data_type_support(src_type), + VERBOSE_UNSUPPORTED_DT); + VDISPATCH_POOLING(platform::has_data_type_support(dst_type), VERBOSE_UNSUPPORTED_DT); VDISPATCH_POOLING(set_default_params() == status::success, VERBOSE_UNSUPPORTED_TAG); VDISPATCH_POOLING(is_fwd(), VERBOSE_BAD_PROPKIND); - VDISPATCH_POOLING(utils::everyone_is(data_type, src_md()->data_type, - dst_md()->data_type), + VDISPATCH_POOLING(utils::everyone_is(src_type, src_md()->data_type), + VERBOSE_UNSUPPORTED_DT); + VDISPATCH_POOLING(utils::everyone_is(dst_type, dst_md()->data_type), VERBOSE_UNSUPPORTED_DT); VDISPATCH_POOLING(desc()->accum_data_type == acc_type, VERBOSE_UNSUPPORTED_DT); VDISPATCH_POOLING(attr()->has_default_values(sm::post_ops), VERBOSE_UNSUPPORTED_ATTR); + // VDISPATCH_POOLING( + // ref_post_ops_t::primitive_kind_ok(attr()->post_ops_), + // VERBOSE_UNSUPPORTED_POSTOP); VDISPATCH_POOLING( - ref_post_ops_t::primitive_kind_ok(attr()->post_ops_), + attr_.set_default_formats(dst_md(0)) == status::success, VERBOSE_UNSUPPORTED_POSTOP); VDISPATCH_POOLING( - attr_.set_default_formats(dst_md(0)) == status::success, + is_supported_post_ops(), VERBOSE_UNSUPPORTED_POSTOP); bool is_training = desc_.prop_kind == prop_kind::forward_training; @@ -68,6 +74,24 @@ struct ref_pooling_fwd_t : public primitive_t { return status::success; } + + virtual bool is_supported_post_ops() const { + const auto &p = this->attr()->post_ops_; + + auto all_post_ops_supported = [&]() { + bool ok = true; + + for (int i = 0; i < p.len(); i++) { +
ok = ok && utils::one_of(p.entry_[i].kind, primitive_kind::quantization); + } + return ok; + }; + + return all_post_ops_supported() && + IMPLICATION(p.len() > 0, (desc()->alg_kind == dnnl_pooling_avg_include_padding || desc()->alg_kind == dnnl_pooling_avg_exclude_padding) && + src_type != data_type::bf16); + + } }; ref_pooling_fwd_t(const pd_t *apd) : primitive_t(apd) {} @@ -80,8 +104,9 @@ struct ref_pooling_fwd_t : public primitive_t { return status::success; } - using data_t = typename prec_traits<data_type>::type; - using acc_data_t = typename prec_traits<acc_type>::type; + using src_data_t = typename prec_traits_t<src_type>::type; + using dst_data_t = typename prec_traits_t<dst_type>::type; + using acc_data_t = typename prec_traits_t<acc_type>::type; status_t execute(const exec_ctx_t &ctx) const override { return execute_forward(ctx); diff --git a/src/cpu/ref_reduction.hpp b/src/cpu/ref_reduction.hpp index 5017b5c57ef..e59033d3461 100644 --- a/src/cpu/ref_reduction.hpp +++ b/src/cpu/ref_reduction.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2020-2024 Intel Corporation +* Copyright 2020-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -75,9 +75,9 @@ struct ref_reduction_t : public primitive_t { return status::success; } - using src_t = typename prec_traits<src_type>::type; - using acc_t = typename prec_traits<acc_type>::type; - using dst_t = typename prec_traits<dst_type>::type; + using src_t = typename prec_traits_t<src_type>::type; + using acc_t = typename prec_traits_t<acc_type>::type; + using dst_t = typename prec_traits_t<dst_type>::type; status_t execute(const exec_ctx_t &ctx) const override { return execute_ref(ctx); diff --git a/src/cpu/ref_resampling.cpp b/src/cpu/ref_resampling.cpp index 740ae062084..63456c620b1 100644 --- a/src/cpu/ref_resampling.cpp +++ b/src/cpu/ref_resampling.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2024 Intel Corporation +* Copyright 2019-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -43,7 +43,7 @@ template <data_type_t type> load_fn_t create_load() { return [](const byte *base, dim_t offset) -> float { return static_cast<float>( - reinterpret_cast<const typename prec_traits<type>::type *>( + reinterpret_cast<const typename prec_traits_t<type>::type *>( base)[offset]); }; } @@ -55,7 +55,7 @@ load_fn_t create_load() { } template <data_type_t type> store_fn_t create_store() { - using dst_t = typename prec_traits<type>::type; + using dst_t = typename prec_traits_t<type>::type; return [](const float val, byte *base, const dim_t offset) { *reinterpret_cast<dst_t *>(base + sizeof(dst_t) * offset) = cpu::q10n::saturate_and_round<dst_t>(val); diff --git a/src/cpu/ref_resampling.hpp b/src/cpu/ref_resampling.hpp index bb0c4e63465..cc6941ca58e 100644 --- a/src/cpu/ref_resampling.hpp +++ b/src/cpu/ref_resampling.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2024 Intel Corporation +* Copyright 2019-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License.
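The create_load()/create_store() factories above keep the resampling reference kernel type-agnostic: instead of templating the whole kernel on the element type, the primitive materializes one type-erased load/store lambda per data type, and the kernel body then computes purely in float. A condensed sketch of the same factory pattern under simplified assumptions (byte_t, load_fn_t, and make_load are illustrative names; the real code dispatches on dnnl data types via prec_traits_t and saturates on store):

#include <cstdint>
#include <functional>

using byte_t = unsigned char;
using load_fn_t = std::function<float(const byte_t *, std::ptrdiff_t)>;

// Build a loader that reinterprets the buffer as T and widens to float.
template <typename T>
load_fn_t make_load() {
    return [](const byte_t *base, std::ptrdiff_t off) -> float {
        return static_cast<float>(reinterpret_cast<const T *>(base)[off]);
    };
}

// Usage: select the loader once per primitive, then call it per element.
// load_fn_t load = make_load<std::int8_t>();
// float v = load(src_buf, 42);

The per-element std::function indirection is the price of the type erasure; it is acceptable for a reference implementation that optimized kernels bypass anyway.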
@@ -67,7 +67,8 @@ struct ref_resampling_fwd_t : public primitive_t { }; ref_resampling_fwd_t(const pd_t *apd); - ~ref_resampling_fwd_t(); + + ~ref_resampling_fwd_t() override; status_t init(engine_t *engine) override { ref_post_ops_ @@ -114,7 +115,8 @@ struct ref_resampling_bwd_t : public primitive_t { }; ref_resampling_bwd_t(const pd_t *apd); - ~ref_resampling_bwd_t(); + + ~ref_resampling_bwd_t() override; status_t execute(const exec_ctx_t &ctx) const override { execute_backward(ctx); diff --git a/src/cpu/ref_shuffle.cpp b/src/cpu/ref_shuffle.cpp index 0e7d86eeb82..81f8b58296f 100644 --- a/src/cpu/ref_shuffle.cpp +++ b/src/cpu/ref_shuffle.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2018-2023 Intel Corporation +* Copyright 2018-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -33,7 +33,7 @@ template <int data_type_size> status_t ref_shuffle_t::execute_(const exec_ctx_t &ctx) const { using namespace prop_kind; using namespace utils; - using data_t = typename typesize_traits<data_type_size>::type; + using data_t = typename typesize_traits_t<data_type_size>::type; const memory_desc_wrapper src_d( pd()->is_fwd() ? pd()->src_md() : pd()->diff_src_md()); diff --git a/src/cpu/ref_shuffle.hpp b/src/cpu/ref_shuffle.hpp index 5d2adf13407..168c7cd6170 100644 --- a/src/cpu/ref_shuffle.hpp +++ b/src/cpu/ref_shuffle.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2018-2024 Intel Corporation +* Copyright 2018-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -92,7 +92,7 @@ struct ref_shuffle_t : public primitive_t { return dnnl_success; } - ~ref_shuffle_t() { free(rev_transposed_); } + ~ref_shuffle_t() override { free(rev_transposed_); } status_t execute(const exec_ctx_t &ctx) const override { const memory_desc_wrapper src_d( diff --git a/src/cpu/ref_softmax.cpp b/src/cpu/ref_softmax.cpp index 93c93f13a02..b709ddf88ea 100644 --- a/src/cpu/ref_softmax.cpp +++ b/src/cpu/ref_softmax.cpp @@ -53,7 +53,9 @@ status_t ref_softmax_fwd_t::execute_forward_dense(const exec_ctx_t &ctx) const { const memory_desc_wrapper src_d(pd()->src_md()); const memory_desc_wrapper dst_d(pd()->dst_md()); - const auto interim_dt = data_type::f32; + const auto interim_dt = pd()->need_intermediate_scratchpad() + ? data_type::f32 + : dst_d.data_type(); const auto is_inplace = (src == dst); const auto has_padding = is_padding(dst_d); const auto zero_padding = has_padding && !is_inplace; @@ -210,7 +212,9 @@ status_t ref_softmax_fwd_t::execute_forward_generic( void *interim_ptr = pd()->need_intermediate_scratchpad() ? interim_scratchpad : dst; - const auto interim_dt = data_type::f32; + const auto interim_dt = pd()->need_intermediate_scratchpad() + ?
data_type::f32 + : dst_d.data_type(); const auto is_inplace = (src == dst); const auto has_padding = is_padding(dst_d); if (has_padding && !is_inplace) { diff --git a/src/cpu/ref_softmax.hpp b/src/cpu/ref_softmax.hpp index 8f05eb08b78..de75fee361d 100644 --- a/src/cpu/ref_softmax.hpp +++ b/src/cpu/ref_softmax.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2016-2024 Intel Corporation +* Copyright 2016-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -30,11 +30,6 @@ #include "cpu/cpu_softmax_pd.hpp" -#define VCHECK_SOFTMAX(cond, msg, ...) \ - VCONDCHECK(primitive, create, dispatch, softmax, (cond), \ - status::unimplemented, "%s," msg, this->info(engine), \ - ##__VA_ARGS__) - namespace dnnl { namespace impl { namespace cpu { @@ -49,26 +44,30 @@ struct ref_softmax_fwd_t : public primitive_t { using namespace data_type; using skip_mask_t = primitive_attr_t::skip_mask_t; - bool ok = is_fwd() - && utils::one_of( - src_md()->data_type, f32, bf16, f16, s8, u8) - && utils::one_of( - dst_md()->data_type, f32, bf16, f16, s8, u8) - && platform::has_data_type_support(src_md()->data_type) - && platform::has_data_type_support(dst_md()->data_type); - if (!ok) return status::unimplemented; - - VCHECK_SOFTMAX( - attr()->has_default_values(skip_mask_t::scales_runtime - | skip_mask_t::post_ops), + VDISPATCH_SOFTMAX(is_fwd(), VERBOSE_BAD_PROPKIND); + VDISPATCH_SOFTMAX( + platform::has_data_type_support(src_md()->data_type), + VERBOSE_UNSUPPORTED_DT); + VDISPATCH_SOFTMAX( + platform::has_data_type_support(dst_md()->data_type), + VERBOSE_UNSUPPORTED_DT); + VDISPATCH_SOFTMAX( + utils::one_of(src_md()->data_type, f32, bf16, f16, s8, u8), + VERBOSE_UNSUPPORTED_DT); + VDISPATCH_SOFTMAX( + utils::one_of(dst_md()->data_type, f32, bf16, f16, s8, u8), + VERBOSE_UNSUPPORTED_DT); + VDISPATCH_SOFTMAX(attr()->has_default_values(skip_mask_t::scales + | skip_mask_t::post_ops), VERBOSE_UNSUPPORTED_ATTR); - VCHECK_SOFTMAX(attr_scales_ok(), VERBOSE_UNSUPPORTED_SCALES_CFG); - VCHECK_SOFTMAX(post_ops_ok(), VERBOSE_UNSUPPORTED_POSTOP); -#undef VCHECK_SOFTMAX + VDISPATCH_SOFTMAX(attr_scales_ok(), VERBOSE_UNSUPPORTED_SCALES_CFG); + VDISPATCH_SOFTMAX(post_ops_ok(), VERBOSE_UNSUPPORTED_POSTOP); - ok = set_default_formats() == status::success - && attr_.set_default_formats(dst_md(0)) == status::success; - if (!ok) return status::unimplemented; + VDISPATCH_SOFTMAX(set_default_formats() == status::success, + VERBOSE_UNSUPPORTED_TAG); + VDISPATCH_SOFTMAX( + attr_.set_default_formats(dst_md(0)) == status::success, + VERBOSE_UNSUPPORTED_POSTOP); nthr_ = 0; init_scratchpad(); @@ -79,9 +78,19 @@ struct ref_softmax_fwd_t : public primitive_t { int nthr_; // To not exceed the limit in execute used for set up. bool need_intermediate_scratchpad() const { - return dst_md()->data_type - != types::default_accum_data_type( - src_md()->data_type, dst_md()->data_type); + const auto src_dt = src_md()->data_type; + const auto dst_dt = dst_md()->data_type; + // Relaxed accumulation allows to downconvert intermediate results + // directly from xf16 or xf8 to dst avoiding scratchpad memory. 
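+ // For example, an f16 softmax with an f16 dst created with + // accumulation_mode::relaxed (or ::any) makes relaxed_acc true below, + // so no f32 scratchpad is requested and execute_forward_dense()/_generic() + // keep interim_dt equal to the dst data type; integral dst types are + // excluded here and always take the f32 scratchpad path.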
+ const bool relaxed_acc = src_dt == dst_dt + && !types::is_integral_dt(dst_dt) + && utils::one_of(attr()->acc_mode_, + accumulation_mode::relaxed, accumulation_mode::any); + const bool need_scratchpad = dst_md()->data_type + != types::default_accum_data_type( + src_md()->data_type, dst_md()->data_type) + && !relaxed_acc; + return need_scratchpad; } private: diff --git a/src/cpu/ref_sum.hpp b/src/cpu/ref_sum.hpp index 917308f6122..e0e2cfb8669 100644 --- a/src/cpu/ref_sum.hpp +++ b/src/cpu/ref_sum.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2017-2024 Intel Corporation +* Copyright 2017-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -46,7 +46,7 @@ struct ref_sum_t : public primitive_t { reorder_pds_.resize(n_ + need_output_reorder()); for (int i = 0; i < n_; ++i) { primitive_attr_t r_attr; - r_attr.scales_.set(DNNL_ARG_SRC, 0); + CHECK(r_attr.scales_.set(DNNL_ARG_SRC, 0)); if (i != 0) r_attr.post_ops_.append_sum(1.0); CHECK(reorder_primitive_desc_create(reorder_pds_[i], engine, src_md(i), dst_acc_md(), &r_attr)); @@ -97,9 +97,10 @@ struct ref_sum_t : public primitive_t { scales_mem_.resize(n); for (size_t i = 0; i < n; ++i) - scales_mem_[i] = std::make_shared<memory_t>(get_service_engine(), - &scales_md, use_runtime_ptr, - const_cast<float *>(&(scales[i]))); + CHECK(safe_ptr_assign(scales_mem_[i], + new memory_t(get_service_engine(), &scales_md, + use_runtime_ptr, + const_cast<float *>(&(scales[i]))))); return status::success; } @@ -116,14 +117,17 @@ struct ref_sum_t : public primitive_t { key_sum_reduction) : nullptr; auto dst = ctx.args().at(DNNL_ARG_DST); - memory_t acc( - dst.mem->engine(), pd()->dst_acc_md(), std::move(sum_reduce)); - memory_arg_t dst_acc = {&acc, false}; + + std::unique_ptr<memory_t> acc; + CHECK(safe_ptr_assign(acc, + new memory_t(dst.mem->engine(), pd()->dst_acc_md(), + std::move(sum_reduce)))); + memory_arg_t dst_acc = {acc.get(), false}; /* fix: clang MemorySanitizer: use-of-uninitialized-value */ if (pd()->need_output_reorder()) { - const memory_desc_wrapper acc_d(acc.md()); - std::memset(acc.memory_storage()->data_handle(), 0, acc_d.size()); + const memory_desc_wrapper acc_d(acc->md()); + std::memset(acc->memory_storage()->data_handle(), 0, acc_d.size()); } for (int i = 0; i < n; ++i) { @@ -140,7 +144,7 @@ struct ref_sum_t : public primitive_t { } if (pd()->need_output_reorder()) { - dst_acc = {&acc, true}; + dst_acc.is_const = true; r_args[DNNL_ARG_SRC] = dst_acc; r_args[DNNL_ARG_DST] = dst; exec_ctx_t r_ctx(ctx, std::move(r_args)); @@ -155,7 +159,7 @@ struct ref_sum_t : public primitive_t { private: const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); } std::vector<std::shared_ptr<primitive_t>> reorders_; - std::vector<std::shared_ptr<memory_t>> scales_mem_; + std::vector<std::unique_ptr<memory_t>> scales_mem_; }; } // namespace cpu diff --git a/src/cpu/reorder/cpu_reorder.cpp b/src/cpu/reorder/cpu_reorder.cpp index d9d8912d91b..1e8b0961636 100644 --- a/src/cpu/reorder/cpu_reorder.cpp +++ b/src/cpu/reorder/cpu_reorder.cpp @@ -25,6 +25,8 @@ namespace cpu { static const std::map<reorder_impl_key_t, const impl_list_map_t *> & regular_impl_list_map() { static const std::map<reorder_impl_key_t, const impl_list_map_t *> the_map = { + {{f32, f4_e2m1, 0}, &regular_fp4_impl_list_map()}, + {{f32, f4_e3m0, 0}, &regular_fp4_impl_list_map()}, {{f32, e8m0, 0}, &regular_f32_fp8_impl_list_map()}, {{f32, f8_e5m2, 0}, &regular_f32_fp8_impl_list_map()}, {{f32, f8_e4m3, 0}, &regular_f32_fp8_impl_list_map()}, @@ -34,8 +36,11 @@ regular_impl_list_map() { {{f32, s32, 0}, &regular_f32_s32_impl_list_map()},
{{f32, s8, 0}, &regular_f32_s8_impl_list_map()}, {{f32, u8, 0}, &regular_f32_u8_impl_list_map()}, + {{f4_e3m0, data_type::undef, 0}, &regular_fp4_impl_list_map()}, {{f8_e5m2, data_type::undef, 0}, &regular_fp8_impl_list_map()}, {{f8_e4m3, data_type::undef, 0}, &regular_fp8_impl_list_map()}, + {{e8m0, data_type::undef, 0}, &regular_fp8_impl_list_map()}, + {{f32, bin, 0}, &regular_f32_bin_impl_list_map()}, {{bf16, data_type::undef, 0}, &regular_bf16_impl_list_map()}, {{f16, data_type::undef, 0}, &regular_f16_impl_list_map()}, {{s32, data_type::undef, 0}, &regular_s32_impl_list_map()}, @@ -43,8 +48,13 @@ regular_impl_list_map() { {{u8, data_type::undef, 0}, &regular_u8_impl_list_map()}, {{f32, s4, 0}, &regular_s4_impl_list_map()}, {{f32, u4, 0}, &regular_u4_impl_list_map()}, - {{s4, f32, 0}, &regular_s4_impl_list_map()}, + {{s4, data_type::undef, 0}, &regular_s4_impl_list_map()}, {{u4, f32, 0}, &regular_u4_impl_list_map()}, + {{bin, data_type::undef, 0}, &regular_bin_impl_list_map()}, + {{nf4, data_type::undef, 0}, &regular_nf4_impl_list_map()}, + {{f4_e2m1, data_type::undef, 0}, &regular_f4_impl_list_map()}, + {{s4, data_type::undef, 0}, &regular_s4_impl_list_map()}, + {{u4, data_type::undef, 0}, &regular_u4_impl_list_map()}, }; return the_map; } diff --git a/src/cpu/reorder/cpu_reorder.hpp b/src/cpu/reorder/cpu_reorder.hpp index dc0105966b1..7816e0397a4 100644 --- a/src/cpu/reorder/cpu_reorder.hpp +++ b/src/cpu/reorder/cpu_reorder.hpp @@ -33,13 +33,14 @@ #if DNNL_X64 #include "cpu/x64/jit_uni_reorder.hpp" +#include "cpu/x64/jit_uni_reorder_direct_copy.hpp" #include "cpu/x64/matmul/brgemm_matmul_reorders.hpp" #elif DNNL_AARCH64 #include "cpu/aarch64/jit_uni_reorder.hpp" #include "cpu/aarch64/matmul/brgemm_matmul_reorders.hpp" #endif -#if DNNL_AARCH64 && DNNL_AARCH64_USE_ACL +#if DNNL_AARCH64 && DNNL_USE_ACL #include "cpu/aarch64/acl_reorder.hpp" #endif @@ -74,6 +75,7 @@ using impl_list_map_t = std::map<reorder_impl_key_t, std::vector<impl_list_item_t>>; /* regular reorders */ +extern const impl_list_map_t &regular_fp4_impl_list_map(); extern const impl_list_map_t &regular_f32_fp8_impl_list_map(); extern const impl_list_map_t &regular_f32_bf16_impl_list_map(); extern const impl_list_map_t &regular_f32_f16_impl_list_map(); @@ -82,6 +84,7 @@ extern const impl_list_map_t &regular_f32_s32_impl_list_map(); extern const impl_list_map_t &regular_f32_s8_impl_list_map(); extern const impl_list_map_t &regular_f32_u8_impl_list_map(); extern const impl_list_map_t &regular_fp8_impl_list_map(); +extern const impl_list_map_t &regular_f32_bin_impl_list_map(); extern const impl_list_map_t &regular_bf16_impl_list_map(); extern const impl_list_map_t &regular_f16_impl_list_map(); extern const impl_list_map_t &regular_s32_impl_list_map(); @@ -89,6 +92,11 @@ extern const impl_list_map_t &regular_s8_impl_list_map(); extern const impl_list_map_t &regular_u8_impl_list_map(); extern const impl_list_map_t &regular_s4_impl_list_map(); extern const impl_list_map_t &regular_u4_impl_list_map(); +extern const impl_list_map_t &regular_bin_impl_list_map(); +extern const impl_list_map_t &regular_nf4_impl_list_map(); +extern const impl_list_map_t &regular_f4_impl_list_map(); +extern const impl_list_map_t &regular_s4_impl_list_map(); +extern const impl_list_map_t &regular_u4_impl_list_map(); /* conv reorders w/ compensation */ extern const impl_list_map_t &comp_f32_s8_impl_list_map(); @@ -96,6 +104,8 @@ extern const impl_list_map_t &comp_bf16_s8_impl_list_map(); extern const impl_list_map_t &comp_s8_s8_impl_list_map(); // clang-format off +#define REG_SPARSE_SR(idt, ifmt, odt, ofmt, ...)
\ + CPU_REORDER_INSTANCE(simple_sparse_reorder_t, idt, ifmt, odt, ofmt, __VA_ARGS__) // Some compilers do not allow guarding implementations with macros // in the impl list. @@ -115,17 +125,44 @@ extern const impl_list_map_t &comp_s8_s8_impl_list_map(); #define REG_SPARSE_SR_X64(...) #endif -#define REG_SR(idt, ifmt, odt, ofmt, ...) \ +using spec_reference = spec::reference; +using spec_direct_copy = spec::direct_copy; +using spec_direct_copy_except_dim_0 = spec::direct_copy_except_dim_0; +using spec_conv_req_comp = spec::conv_req_comp; +constexpr bool fmt_order_keep = fmt_order::keep; +constexpr bool fmt_order_reverse = fmt_order::reverse; +constexpr bool fmt_order_any = fmt_order::any; + +#if DNNL_X64 +using x64_jit_blk_reorder_t = x64::jit_blk_reorder_t; +using x64_jit_uni_reorder_t = x64::jit_uni_reorder_t; +using x64_brgemm_matmul_copy_reorder_t = x64::brgemm_matmul_copy_reorder_t; +using x64_jit_uni_reorder_direct_copy_t = x64::jit_uni_reorder_direct_copy_t; +#elif DNNL_AARCH64 +using aarch64_jit_blk_reorder_t = aarch64::jit_blk_reorder_t; +using aarch64_jit_uni_reorder_t = aarch64::jit_uni_reorder_t; +#endif + +#define CPU_REORDER_INSTANCE_IMPL(...) \ impl_list_item_t(impl_list_item_t::reorder_type_deduction_helper_t< \ - simple_reorder_t<idt, ifmt, odt, ofmt, __VA_ARGS__>::pd_t>()), + __VA_ARGS__::pd_t>()) + +#define CPU_REORDER_INSTANCE(...) \ + DNNL_PRIMITIVE_IMPL(CPU_REORDER_INSTANCE_IMPL, __VA_ARGS__) + +#define REG_SR(idt, ifmt, odt, ofmt, ...) \ + CPU_REORDER_INSTANCE(simple_reorder_t, idt, ifmt, odt, ofmt, __VA_ARGS__) + /* impl_list_item_t(impl_list_item_t::reorder_type_deduction_helper_t< \ + simple_reorder_t<idt, ifmt, odt, ofmt, __VA_ARGS__>::pd_t>()), + */ #define REG_SR_BIDIR(idt, ifmt, odt, ofmt) \ - REG_SR(idt, ifmt, odt, ofmt, fmt_order::keep) \ - REG_SR(idt, ifmt, odt, ofmt, fmt_order::reverse) + REG_SR(idt, ifmt, odt, ofmt, fmt_order_keep) \ + REG_SR(idt, ifmt, odt, ofmt, fmt_order_reverse) #define REG_SR_DIRECT_COPY(idt, odt) \ - REG_SR(idt, any, odt, any, fmt_order::any, spec::direct_copy) \ - REG_SR(idt, any, odt, any, fmt_order::any, spec::direct_copy_except_dim_0) + REG_SR(idt, any, odt, any, fmt_order_any, spec_direct_copy) \ + REG_SR(idt, any, odt, any, fmt_order_any, spec_direct_copy_except_dim_0) // clang-format on @@ -147,10 +184,6 @@ extern const impl_list_map_t &comp_s8_s8_impl_list_map(); #define REG_FAST_DIRECT_COPY(sdt, ddt) #endif -#define CPU_REORDER_INSTANCE(...)
\ - impl_list_item_t(impl_list_item_t::reorder_type_deduction_helper_t< \ - __VA_ARGS__::pd_t>()), - } // namespace cpu } // namespace impl } // namespace dnnl diff --git a/src/cpu/reorder/cpu_reorder_comp_bf16_s8.cpp b/src/cpu/reorder/cpu_reorder_comp_bf16_s8.cpp index d34f4fe1e48..759706c737b 100644 --- a/src/cpu/reorder/cpu_reorder_comp_bf16_s8.cpp +++ b/src/cpu/reorder/cpu_reorder_comp_bf16_s8.cpp @@ -26,170 +26,170 @@ const impl_list_map_t &comp_bf16_s8_impl_list_map() { static const impl_list_map_t the_map = REG_REORDER_P({ // bf16 -> s8 {{bf16, s8, 2}, { - DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_uni_reorder_t)) - DNNL_NON_X64_ONLY(REG_SR(bf16, oi, s8, OI4i16o4i, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, format_tag::io, s8, OI4i16o4i, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, oi, s8, OI4i32o4i, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, format_tag::io, s8, OI4i32o4i, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, oi, s8, OI4i64o4i, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, format_tag::io, s8, OI4i64o4i, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, ab, s8, BA16a16b4a, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, ab, s8, BA16a32b4a, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, ab, s8, BA16a48b4a, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, ab, s8, BA16a64b4a, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, ba, s8, BA16a16b4a, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, ba, s8, BA16a32b4a, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, ba, s8, BA16a48b4a, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, ba, s8, BA16a64b4a, fmt_order::keep, spec::conv_req_comp)) - REG_SR(bf16, ab, s8, BA16a16b4a, fmt_order::keep, spec::conv_req_comp) - REG_SR(bf16, ab, s8, BA16a32b4a, fmt_order::keep, spec::conv_req_comp) - REG_SR(bf16, ab, s8, BA16a48b4a, fmt_order::keep, spec::conv_req_comp) - REG_SR(bf16, ab, s8, BA16a64b4a, fmt_order::keep, spec::conv_req_comp) - REG_SR(bf16, ba, s8, BA16a16b4a, fmt_order::keep, spec::conv_req_comp) - REG_SR(bf16, ba, s8, BA16a32b4a, fmt_order::keep, spec::conv_req_comp) - REG_SR(bf16, ba, s8, BA16a48b4a, fmt_order::keep, spec::conv_req_comp) - REG_SR(bf16, ba, s8, BA16a64b4a, fmt_order::keep, spec::conv_req_comp) + DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64_jit_uni_reorder_t)) + DNNL_NON_X64_ONLY(REG_SR(bf16, oi, s8, OI4i16o4i, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, format_tag::io, s8, OI4i16o4i, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, oi, s8, OI4i32o4i, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, format_tag::io, s8, OI4i32o4i, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, oi, s8, OI4i64o4i, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, format_tag::io, s8, OI4i64o4i, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, ab, s8, BA16a16b4a, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, ab, s8, BA16a32b4a, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, ab, s8, BA16a48b4a, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, ab, s8, BA16a64b4a, fmt_order_keep, spec_conv_req_comp)) + 
DNNL_NON_X64_ONLY(REG_SR(bf16, ba, s8, BA16a16b4a, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, ba, s8, BA16a32b4a, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, ba, s8, BA16a48b4a, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, ba, s8, BA16a64b4a, fmt_order_keep, spec_conv_req_comp)) + REG_SR(bf16, ab, s8, BA16a16b4a, fmt_order_keep, spec_conv_req_comp) + REG_SR(bf16, ab, s8, BA16a32b4a, fmt_order_keep, spec_conv_req_comp) + REG_SR(bf16, ab, s8, BA16a48b4a, fmt_order_keep, spec_conv_req_comp) + REG_SR(bf16, ab, s8, BA16a64b4a, fmt_order_keep, spec_conv_req_comp) + REG_SR(bf16, ba, s8, BA16a16b4a, fmt_order_keep, spec_conv_req_comp) + REG_SR(bf16, ba, s8, BA16a32b4a, fmt_order_keep, spec_conv_req_comp) + REG_SR(bf16, ba, s8, BA16a48b4a, fmt_order_keep, spec_conv_req_comp) + REG_SR(bf16, ba, s8, BA16a64b4a, fmt_order_keep, spec_conv_req_comp) nullptr, }}, // bf16 -> s8 {{bf16, s8, 3}, { - DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_uni_reorder_t)) - DNNL_NON_X64_ONLY(REG_SR(bf16, any, s8, wio, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, iwo, s8, OIw4i16o4i, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, iwo, s8, OIw4i32o4i, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, iwo, s8, OIw4i64o4i, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, oiw, s8, OIw4i16o4i, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, oiw, s8, OIw4i32o4i, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, oiw, s8, OIw4i64o4i, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, wio, s8, OIw4i16o4i, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, wio, s8, OIw4i32o4i, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, wio, s8, OIw4i64o4i, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, iwo, s8, OIw2i8o4i, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, oiw, s8, OIw2i8o4i, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, wio, s8, OIw2i8o4i, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, iwo, s8, OIw4o4i, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, oiw, s8, OIw4o4i, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, wio, s8, OIw4o4i, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, iwo, s8, Owi16o, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, oiw, s8, Owi16o, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, wio, s8, Owi16o, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, iwo, s8, OwI16o4i, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, oiw, s8, OwI16o4i, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, wio, s8, OwI16o4i, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, iwo, s8, OIw16i16o4i, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, oiw, s8, OIw16i16o4i, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, wio, s8, OIw16i16o4i, fmt_order::keep, spec::conv_req_comp)) - REG_SR(bf16, abc, s8, aCB16b16c4b, fmt_order::keep, spec::conv_req_comp) - REG_SR(bf16, abc, s8, aCB16b32c4b, fmt_order::keep, spec::conv_req_comp) - REG_SR(bf16, abc, s8, aCB16b48c4b, fmt_order::keep, spec::conv_req_comp) - 
REG_SR(bf16, abc, s8, aCB16b64c4b, fmt_order::keep, spec::conv_req_comp) - REG_SR(bf16, acb, s8, aCB16b16c4b, fmt_order::keep, spec::conv_req_comp) - REG_SR(bf16, acb, s8, aCB16b32c4b, fmt_order::keep, spec::conv_req_comp) - REG_SR(bf16, acb, s8, aCB16b48c4b, fmt_order::keep, spec::conv_req_comp) - REG_SR(bf16, acb, s8, aCB16b64c4b, fmt_order::keep, spec::conv_req_comp) + DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64_jit_uni_reorder_t)) + DNNL_NON_X64_ONLY(REG_SR(bf16, any, s8, wio, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, iwo, s8, OIw4i16o4i, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, iwo, s8, OIw4i32o4i, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, iwo, s8, OIw4i64o4i, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, oiw, s8, OIw4i16o4i, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, oiw, s8, OIw4i32o4i, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, oiw, s8, OIw4i64o4i, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, wio, s8, OIw4i16o4i, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, wio, s8, OIw4i32o4i, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, wio, s8, OIw4i64o4i, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, iwo, s8, OIw2i8o4i, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, oiw, s8, OIw2i8o4i, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, wio, s8, OIw2i8o4i, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, iwo, s8, OIw4o4i, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, oiw, s8, OIw4o4i, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, wio, s8, OIw4o4i, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, iwo, s8, Owi16o, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, oiw, s8, Owi16o, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, wio, s8, Owi16o, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, iwo, s8, OwI16o4i, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, oiw, s8, OwI16o4i, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, wio, s8, OwI16o4i, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, iwo, s8, OIw16i16o4i, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, oiw, s8, OIw16i16o4i, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, wio, s8, OIw16i16o4i, fmt_order_keep, spec_conv_req_comp)) + REG_SR(bf16, abc, s8, aCB16b16c4b, fmt_order_keep, spec_conv_req_comp) + REG_SR(bf16, abc, s8, aCB16b32c4b, fmt_order_keep, spec_conv_req_comp) + REG_SR(bf16, abc, s8, aCB16b48c4b, fmt_order_keep, spec_conv_req_comp) + REG_SR(bf16, abc, s8, aCB16b64c4b, fmt_order_keep, spec_conv_req_comp) + REG_SR(bf16, acb, s8, aCB16b16c4b, fmt_order_keep, spec_conv_req_comp) + REG_SR(bf16, acb, s8, aCB16b32c4b, fmt_order_keep, spec_conv_req_comp) + REG_SR(bf16, acb, s8, aCB16b48c4b, fmt_order_keep, spec_conv_req_comp) + REG_SR(bf16, acb, s8, aCB16b64c4b, fmt_order_keep, spec_conv_req_comp) nullptr, }}, {{bf16, s8, 4}, { - DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_uni_reorder_t)) - DNNL_NON_X64_ONLY(REG_SR(bf16, any, s8, hwio, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, any, s8, wigo, fmt_order::keep, spec::conv_req_comp)) - 
DNNL_NON_X64_ONLY(REG_SR(bf16, goiw, s8, gOIw4i16o4i, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, wigo, s8, gOIw4i16o4i, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, goiw, s8, gOIw2i8o4i, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, wigo, s8, gOIw2i8o4i, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, goiw, s8, gOIw4o4i, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, wigo, s8, gOIw4o4i, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, ihwo, s8, OIhw4i16o4i, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, ihwo, s8, OIhw4i32o4i, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, ihwo, s8, OIhw4i64o4i, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, oihw, s8, OIhw4i16o4i, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, oihw, s8, OIhw4i32o4i, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, oihw, s8, OIhw4i64o4i, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, hwio, s8, OIhw4i16o4i, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, hwio, s8, OIhw4i32o4i, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, hwio, s8, OIhw4i64o4i, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, ihwo, s8, OIhw2i8o4i, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, oihw, s8, OIhw2i8o4i, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, hwio, s8, OIhw2i8o4i, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, ihwo, s8, OIhw4o4i, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, oihw, s8, OIhw4o4i, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, hwio, s8, OIhw4o4i, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, goiw, s8, Goiw16g, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, wigo, s8, Goiw16g, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, goiw, s8, Goiw8g, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, wigo, s8, Goiw8g, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, goiw, s8, Goiw4g, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, wigo, s8, Goiw4g, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, goiw, s8, gOwi16o, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, wigo, s8, gOwi16o, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, goiw, s8, gOwI16o4i, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, wigo, s8, gOwI16o4i, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, goiw, s8, gOIw16i16o4i, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, wigo, s8, gOIw16i16o4i, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, ihwo, s8, Owhi16o, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, oihw, s8, Owhi16o, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, hwio, s8, Owhi16o, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, ihwo, s8, OhwI16o4i, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, oihw, s8, OhwI16o4i, fmt_order::keep, spec::conv_req_comp)) - 
DNNL_NON_X64_ONLY(REG_SR(bf16, hwio, s8, OhwI16o4i, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, ihwo, s8, OIhw16i16o4i, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, oihw, s8, OIhw16i16o4i, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, hwio, s8, OIhw16i16o4i, fmt_order::keep, spec::conv_req_comp)) + DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64_jit_uni_reorder_t)) + DNNL_NON_X64_ONLY(REG_SR(bf16, any, s8, hwio, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, any, s8, wigo, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, goiw, s8, gOIw4i16o4i, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, wigo, s8, gOIw4i16o4i, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, goiw, s8, gOIw2i8o4i, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, wigo, s8, gOIw2i8o4i, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, goiw, s8, gOIw4o4i, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, wigo, s8, gOIw4o4i, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, ihwo, s8, OIhw4i16o4i, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, ihwo, s8, OIhw4i32o4i, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, ihwo, s8, OIhw4i64o4i, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, oihw, s8, OIhw4i16o4i, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, oihw, s8, OIhw4i32o4i, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, oihw, s8, OIhw4i64o4i, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, hwio, s8, OIhw4i16o4i, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, hwio, s8, OIhw4i32o4i, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, hwio, s8, OIhw4i64o4i, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, ihwo, s8, OIhw2i8o4i, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, oihw, s8, OIhw2i8o4i, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, hwio, s8, OIhw2i8o4i, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, ihwo, s8, OIhw4o4i, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, oihw, s8, OIhw4o4i, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, hwio, s8, OIhw4o4i, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, goiw, s8, Goiw16g, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, wigo, s8, Goiw16g, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, goiw, s8, Goiw8g, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, wigo, s8, Goiw8g, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, goiw, s8, Goiw4g, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, wigo, s8, Goiw4g, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, goiw, s8, gOwi16o, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, wigo, s8, gOwi16o, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, goiw, s8, gOwI16o4i, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, wigo, s8, gOwI16o4i, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, goiw, s8, gOIw16i16o4i, fmt_order_keep, spec_conv_req_comp)) + 
DNNL_NON_X64_ONLY(REG_SR(bf16, wigo, s8, gOIw16i16o4i, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, ihwo, s8, Owhi16o, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, oihw, s8, Owhi16o, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, hwio, s8, Owhi16o, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, ihwo, s8, OhwI16o4i, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, oihw, s8, OhwI16o4i, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, hwio, s8, OhwI16o4i, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, ihwo, s8, OIhw16i16o4i, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, oihw, s8, OIhw16i16o4i, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, hwio, s8, OIhw16i16o4i, fmt_order_keep, spec_conv_req_comp)) nullptr, }}, {{bf16, s8, 5}, { - DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_uni_reorder_t)) - DNNL_NON_X64_ONLY(REG_SR(bf16, any, s8, hwigo, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, any, s8, dhwio, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, goihw, s8, gOIhw4i16o4i, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, hwigo, s8, gOIhw4i16o4i, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, goihw, s8, gOIhw2i8o4i, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, hwigo, s8, gOIhw2i8o4i, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, goihw, s8, gOIhw4o4i, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, hwigo, s8, gOIhw4o4i, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, idhwo, s8, OIdhw4i16o4i, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, idhwo, s8, OIdhw4i32o4i, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, idhwo, s8, OIdhw4i64o4i, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, oidhw, s8, OIdhw4i16o4i, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, oidhw, s8, OIdhw4i32o4i, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, oidhw, s8, OIdhw4i64o4i, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, dhwio, s8, OIdhw4i16o4i, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, dhwio, s8, OIdhw4i32o4i, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, dhwio, s8, OIdhw4i64o4i, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, idhwo, s8, OIdhw2i8o4i, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, oidhw, s8, OIdhw2i8o4i, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, dhwio, s8, OIdhw2i8o4i, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, idhwo, s8, OIdhw4o4i, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, oidhw, s8, OIdhw4o4i, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, dhwio, s8, OIdhw4o4i, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, goihw, s8, Goihw16g, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, hwigo, s8, Goihw16g, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, goihw, s8, Goihw8g, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, hwigo, s8, Goihw8g, fmt_order::keep, 
spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, goihw, s8, Goihw4g, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, hwigo, s8, Goihw4g, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, goihw, s8, gOwhi16o, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, hwigo, s8, gOwhi16o, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, goihw, s8, gOhwI16o4i, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, hwigo, s8, gOhwI16o4i, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, goihw, s8, gOIhw16i16o4i, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, hwigo, s8, gOIhw16i16o4i, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, idhwo, s8, OdhwI16o4i, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, oidhw, s8, OdhwI16o4i, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, dhwio, s8, OdhwI16o4i, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, idhwo, s8, OIdhw16i16o4i, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, oidhw, s8, OIdhw16i16o4i, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, dhwio, s8, OIdhw16i16o4i, fmt_order::keep, spec::conv_req_comp)) + DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64_jit_uni_reorder_t)) + DNNL_NON_X64_ONLY(REG_SR(bf16, any, s8, hwigo, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, any, s8, dhwio, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, goihw, s8, gOIhw4i16o4i, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, hwigo, s8, gOIhw4i16o4i, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, goihw, s8, gOIhw2i8o4i, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, hwigo, s8, gOIhw2i8o4i, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, goihw, s8, gOIhw4o4i, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, hwigo, s8, gOIhw4o4i, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, idhwo, s8, OIdhw4i16o4i, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, idhwo, s8, OIdhw4i32o4i, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, idhwo, s8, OIdhw4i64o4i, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, oidhw, s8, OIdhw4i16o4i, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, oidhw, s8, OIdhw4i32o4i, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, oidhw, s8, OIdhw4i64o4i, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, dhwio, s8, OIdhw4i16o4i, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, dhwio, s8, OIdhw4i32o4i, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, dhwio, s8, OIdhw4i64o4i, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, idhwo, s8, OIdhw2i8o4i, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, oidhw, s8, OIdhw2i8o4i, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, dhwio, s8, OIdhw2i8o4i, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, idhwo, s8, OIdhw4o4i, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, oidhw, s8, OIdhw4o4i, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, dhwio, s8, OIdhw4o4i, fmt_order_keep, 
spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, goihw, s8, Goihw16g, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, hwigo, s8, Goihw16g, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, goihw, s8, Goihw8g, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, hwigo, s8, Goihw8g, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, goihw, s8, Goihw4g, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, hwigo, s8, Goihw4g, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, goihw, s8, gOwhi16o, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, hwigo, s8, gOwhi16o, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, goihw, s8, gOhwI16o4i, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, hwigo, s8, gOhwI16o4i, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, goihw, s8, gOIhw16i16o4i, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, hwigo, s8, gOIhw16i16o4i, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, idhwo, s8, OdhwI16o4i, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, oidhw, s8, OdhwI16o4i, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, dhwio, s8, OdhwI16o4i, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, idhwo, s8, OIdhw16i16o4i, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, oidhw, s8, OIdhw16i16o4i, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, dhwio, s8, OIdhw16i16o4i, fmt_order_keep, spec_conv_req_comp)) nullptr, }}, {{bf16, s8, 6}, { - DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_uni_reorder_t)) - DNNL_NON_X64_ONLY(REG_SR(bf16, any, s8, dhwigo, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, goidhw, s8, gOIdhw4i16o4i, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, goidhw, s8, gOIdhw2i8o4i, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, goidhw, s8, gOIdhw4o4i, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, goidhw, s8, gOdhwI16o4i, fmt_order::keep, spec::conv_req_comp)) - DNNL_NON_X64_ONLY(REG_SR(bf16, goidhw, s8, gOIdhw16i16o4i, fmt_order::keep, spec::conv_req_comp)) + DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64_jit_uni_reorder_t)) + DNNL_NON_X64_ONLY(REG_SR(bf16, any, s8, dhwigo, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, goidhw, s8, gOIdhw4i16o4i, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, goidhw, s8, gOIdhw2i8o4i, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, goidhw, s8, gOIdhw4o4i, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, goidhw, s8, gOdhwI16o4i, fmt_order_keep, spec_conv_req_comp)) + DNNL_NON_X64_ONLY(REG_SR(bf16, goidhw, s8, gOIdhw16i16o4i, fmt_order_keep, spec_conv_req_comp)) nullptr, }}, }); diff --git a/src/cpu/reorder/cpu_reorder_comp_f32_s8.cpp b/src/cpu/reorder/cpu_reorder_comp_f32_s8.cpp index a296ac8d2f6..104868ac072 100644 --- a/src/cpu/reorder/cpu_reorder_comp_f32_s8.cpp +++ b/src/cpu/reorder/cpu_reorder_comp_f32_s8.cpp @@ -27,168 +27,168 @@ const impl_list_map_t &comp_f32_s8_impl_list_map() { static const impl_list_map_t the_map = REG_REORDER_P({ // f32 -> s8 {{f32, s8, 2}, { - DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_uni_reorder_t)) - DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::jit_uni_reorder_t)) - 
DNNL_NON_X64_ONLY(REG_SR(f32, oi, s8, OI4i16o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, format_tag::io, s8, OI4i16o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, oi, s8, OI4i32o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, format_tag::io, s8, OI4i32o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, oi, s8, OI4i64o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, format_tag::io, s8, OI4i64o4i, fmt_order::keep, spec::conv_req_comp))
- REG_SR(f32, ab, s8, BA16a16b4a, fmt_order::keep, spec::conv_req_comp)
- REG_SR(f32, ab, s8, BA16a32b4a, fmt_order::keep, spec::conv_req_comp)
- REG_SR(f32, ab, s8, BA16a48b4a, fmt_order::keep, spec::conv_req_comp)
- REG_SR(f32, ab, s8, BA16a64b4a, fmt_order::keep, spec::conv_req_comp)
- REG_SR(f32, ba, s8, BA16a16b4a, fmt_order::keep, spec::conv_req_comp)
- REG_SR(f32, ba, s8, BA16a32b4a, fmt_order::keep, spec::conv_req_comp)
- REG_SR(f32, ba, s8, BA16a48b4a, fmt_order::keep, spec::conv_req_comp)
- REG_SR(f32, ba, s8, BA16a64b4a, fmt_order::keep, spec::conv_req_comp)
+ DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_uni_reorder_t))
+ DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::jit_uni_reorder_t))
+ DNNL_NON_X64_ONLY(REG_SR(f32, oi, s8, OI4i16o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, format_tag::io, s8, OI4i16o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, oi, s8, OI4i32o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, format_tag::io, s8, OI4i32o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, oi, s8, OI4i64o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, format_tag::io, s8, OI4i64o4i, fmt_order::keep, spec::conv_req_comp))
+ REG_SR(f32, ab, s8, BA16a16b4a, fmt_order::keep, spec::conv_req_comp)
+ REG_SR(f32, ab, s8, BA16a32b4a, fmt_order::keep, spec::conv_req_comp)
+ REG_SR(f32, ab, s8, BA16a48b4a, fmt_order::keep, spec::conv_req_comp)
+ REG_SR(f32, ab, s8, BA16a64b4a, fmt_order::keep, spec::conv_req_comp)
+ REG_SR(f32, ba, s8, BA16a16b4a, fmt_order::keep, spec::conv_req_comp)
+ REG_SR(f32, ba, s8, BA16a32b4a, fmt_order::keep, spec::conv_req_comp)
+ REG_SR(f32, ba, s8, BA16a48b4a, fmt_order::keep, spec::conv_req_comp)
+ REG_SR(f32, ba, s8, BA16a64b4a, fmt_order::keep, spec::conv_req_comp)
 nullptr,
 }},
 // f32 -> s8
 {{f32, s8, 3}, {
- DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_uni_reorder_t))
- DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::jit_uni_reorder_t))
- DNNL_NON_X64_ONLY(REG_SR(f32, any, s8, wio, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, iwo, s8, OIw4i16o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, iwo, s8, OIw4i32o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, iwo, s8, OIw4i64o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, oiw, s8, OIw4i16o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, oiw, s8, OIw4i32o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, oiw, s8, OIw4i64o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, wio, s8, OIw4i16o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, wio, s8, OIw4i32o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, wio, s8, OIw4i64o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, iwo, s8, OIw2i8o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, oiw, s8, OIw2i8o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, wio, s8, OIw2i8o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, iwo, s8, OIw4o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, oiw, s8, OIw4o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, wio, s8, OIw4o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, iwo, s8, Owi16o, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, oiw, s8, Owi16o, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, wio, s8, Owi16o, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, iwo, s8, OwI16o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, oiw, s8, OwI16o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, wio, s8, OwI16o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, iwo, s8, OIw16i16o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, oiw, s8, OIw16i16o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, wio, s8, OIw16i16o4i, fmt_order::keep, spec::conv_req_comp))
- REG_SR(f32, abc, s8, aCB16b16c4b, fmt_order::keep, spec::conv_req_comp)
- REG_SR(f32, abc, s8, aCB16b32c4b, fmt_order::keep, spec::conv_req_comp)
- REG_SR(f32, abc, s8, aCB16b48c4b, fmt_order::keep, spec::conv_req_comp)
- REG_SR(f32, abc, s8, aCB16b64c4b, fmt_order::keep, spec::conv_req_comp)
- REG_SR(f32, acb, s8, aCB16b16c4b, fmt_order::keep, spec::conv_req_comp)
- REG_SR(f32, acb, s8, aCB16b32c4b, fmt_order::keep, spec::conv_req_comp)
- REG_SR(f32, acb, s8, aCB16b48c4b, fmt_order::keep, spec::conv_req_comp)
- REG_SR(f32, acb, s8, aCB16b64c4b, fmt_order::keep, spec::conv_req_comp)
+ DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_uni_reorder_t))
+ DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::jit_uni_reorder_t))
+ DNNL_NON_X64_ONLY(REG_SR(f32, any, s8, wio, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, iwo, s8, OIw4i16o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, iwo, s8, OIw4i32o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, iwo, s8, OIw4i64o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, oiw, s8, OIw4i16o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, oiw, s8, OIw4i32o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, oiw, s8, OIw4i64o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, wio, s8, OIw4i16o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, wio, s8, OIw4i32o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, wio, s8, OIw4i64o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, iwo, s8, OIw2i8o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, oiw, s8, OIw2i8o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, wio, s8, OIw2i8o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, iwo, s8, OIw4o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, oiw, s8, OIw4o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, wio, s8, OIw4o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, iwo, s8, Owi16o, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, oiw, s8, Owi16o, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, wio, s8, Owi16o, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, iwo, s8, OwI16o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, oiw, s8, OwI16o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, wio, s8, OwI16o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, iwo, s8, OIw16i16o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, oiw, s8, OIw16i16o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, wio, s8, OIw16i16o4i, fmt_order::keep, spec::conv_req_comp))
+ REG_SR(f32, abc, s8, aCB16b16c4b, fmt_order::keep, spec::conv_req_comp)
+ REG_SR(f32, abc, s8, aCB16b32c4b, fmt_order::keep, spec::conv_req_comp)
+ REG_SR(f32, abc, s8, aCB16b48c4b, fmt_order::keep, spec::conv_req_comp)
+ REG_SR(f32, abc, s8, aCB16b64c4b, fmt_order::keep, spec::conv_req_comp)
+ REG_SR(f32, acb, s8, aCB16b16c4b, fmt_order::keep, spec::conv_req_comp)
+ REG_SR(f32, acb, s8, aCB16b32c4b, fmt_order::keep, spec::conv_req_comp)
+ REG_SR(f32, acb, s8, aCB16b48c4b, fmt_order::keep, spec::conv_req_comp)
+ REG_SR(f32, acb, s8, aCB16b64c4b, fmt_order::keep, spec::conv_req_comp)
 nullptr,
 }},
 {{f32, s8, 4}, {
- DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_uni_reorder_t))
- DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::jit_uni_reorder_t))
- DNNL_NON_X64_ONLY(REG_SR(f32, any, s8, hwio, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, any, s8, wigo, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, goiw, s8, gOIw4i16o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, wigo, s8, gOIw4i16o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, goiw, s8, gOIw2i8o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, wigo, s8, gOIw2i8o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, goiw, s8, gOIw4o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, wigo, s8, gOIw4o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, ihwo, s8, OIhw4i16o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, ihwo, s8, OIhw4i32o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, ihwo, s8, OIhw4i64o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, oihw, s8, OIhw4i16o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, oihw, s8, OIhw4i32o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, oihw, s8, OIhw4i64o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, hwio, s8, OIhw4i16o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, hwio, s8, OIhw4i32o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, hwio, s8, OIhw4i64o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, hwio, s8, OIhw2i8o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, ihwo, s8, OIhw2i8o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, oihw, s8, OIhw2i8o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, hwio, s8, OIhw4o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, ihwo, s8, OIhw4o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, oihw, s8, OIhw4o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, goiw, s8, Goiw16g, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, wigo, s8, Goiw16g, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, goiw, s8, Goiw8g, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, wigo, s8, Goiw8g, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, goiw, s8, Goiw4g, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, wigo, s8, Goiw4g, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, goiw, s8, gOwi16o, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, wigo, s8, gOwi16o, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, goiw, s8, gOwI16o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, wigo, s8, gOwI16o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, goiw, s8, gOIw16i16o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, wigo, s8, gOIw16i16o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, ihwo, s8, Owhi16o, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, oihw, s8, Owhi16o, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, hwio, s8, Owhi16o, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, ihwo, s8, OhwI16o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, oihw, s8, OhwI16o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, hwio, s8, OhwI16o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, ihwo, s8, OIhw16i16o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, oihw, s8, OIhw16i16o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, hwio, s8, OIhw16i16o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_uni_reorder_t))
+ DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::jit_uni_reorder_t))
+ DNNL_NON_X64_ONLY(REG_SR(f32, any, s8, hwio, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, any, s8, wigo, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, goiw, s8, gOIw4i16o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, wigo, s8, gOIw4i16o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, goiw, s8, gOIw2i8o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, wigo, s8, gOIw2i8o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, goiw, s8, gOIw4o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, wigo, s8, gOIw4o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, ihwo, s8, OIhw4i16o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, ihwo, s8, OIhw4i32o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, ihwo, s8, OIhw4i64o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, oihw, s8, OIhw4i16o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, oihw, s8, OIhw4i32o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, oihw, s8, OIhw4i64o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, hwio, s8, OIhw4i16o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, hwio, s8, OIhw4i32o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, hwio, s8, OIhw4i64o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, hwio, s8, OIhw2i8o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, ihwo, s8, OIhw2i8o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, oihw, s8, OIhw2i8o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, hwio, s8, OIhw4o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, ihwo, s8, OIhw4o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, oihw, s8, OIhw4o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, goiw, s8, Goiw16g, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, wigo, s8, Goiw16g, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, goiw, s8, Goiw8g, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, wigo, s8, Goiw8g, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, goiw, s8, Goiw4g, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, wigo, s8, Goiw4g, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, goiw, s8, gOwi16o, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, wigo, s8, gOwi16o, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, goiw, s8, gOwI16o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, wigo, s8, gOwI16o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, goiw, s8, gOIw16i16o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, wigo, s8, gOIw16i16o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, ihwo, s8, Owhi16o, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, oihw, s8, Owhi16o, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, hwio, s8, Owhi16o, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, ihwo, s8, OhwI16o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, oihw, s8, OhwI16o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, hwio, s8, OhwI16o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, ihwo, s8, OIhw16i16o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, oihw, s8, OIhw16i16o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, hwio, s8, OIhw16i16o4i, fmt_order::keep, spec::conv_req_comp))
 nullptr,
 }},
 {{f32, s8, 5}, {
- DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_uni_reorder_t))
- DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::jit_uni_reorder_t))
+ DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_uni_reorder_t))
+ DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::jit_uni_reorder_t))
- DNNL_NON_X64_ONLY(REG_SR(f32, any, s8, hwigo, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, any, s8, dhwio, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, goihw, s8, gOIhw4i16o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, hwigo, s8, gOIhw4i16o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, goihw, s8, gOIhw2i8o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, hwigo, s8, gOIhw2i8o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, goihw, s8, gOIhw4o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, hwigo, s8, gOIhw4o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, idhwo, s8, OIdhw4i16o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, idhwo, s8, OIdhw4i32o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, idhwo, s8, OIdhw4i64o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, oidhw, s8, OIdhw4i16o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, oidhw, s8, OIdhw4i32o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, oidhw, s8, OIdhw4i64o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, dhwio, s8, OIdhw4i16o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, dhwio, s8, OIdhw4i32o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, dhwio, s8, OIdhw4i64o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, idhwo, s8, OIdhw2i8o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, oidhw, s8, OIdhw2i8o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, dhwio, s8, OIdhw2i8o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, idhwo, s8, OIdhw4o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, oidhw, s8, OIdhw4o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, dhwio, s8, OIdhw4o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, goihw, s8, Goihw16g, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, hwigo, s8, Goihw16g, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, goihw, s8, Goihw8g, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, hwigo, s8, Goihw8g, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, goihw, s8, Goihw4g, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, hwigo, s8, Goihw4g, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, goihw, s8, gOwhi16o, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, hwigo, s8, gOwhi16o, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, goihw, s8, gOhwI16o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, hwigo, s8, gOhwI16o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, goihw, s8, gOIhw16i16o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, hwigo, s8, gOIhw16i16o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, idhwo, s8, OdhwI16o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, oidhw, s8, OdhwI16o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, dhwio, s8, OdhwI16o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, idhwo, s8, OIdhw16i16o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, oidhw, s8, OIdhw16i16o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, dhwio, s8, OIdhw16i16o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, any, s8, hwigo, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, any, s8, dhwio, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, goihw, s8, gOIhw4i16o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, hwigo, s8, gOIhw4i16o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, goihw, s8, gOIhw2i8o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, hwigo, s8, gOIhw2i8o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, goihw, s8, gOIhw4o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, hwigo, s8, gOIhw4o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, idhwo, s8, OIdhw4i16o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, idhwo, s8, OIdhw4i32o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, idhwo, s8, OIdhw4i64o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, oidhw, s8, OIdhw4i16o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, oidhw, s8, OIdhw4i32o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, oidhw, s8, OIdhw4i64o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, dhwio, s8, OIdhw4i16o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, dhwio, s8, OIdhw4i32o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, dhwio, s8, OIdhw4i64o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, idhwo, s8, OIdhw2i8o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, oidhw, s8, OIdhw2i8o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, dhwio, s8, OIdhw2i8o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, idhwo, s8, OIdhw4o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, oidhw, s8, OIdhw4o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, dhwio, s8, OIdhw4o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, goihw, s8, Goihw16g, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, hwigo, s8, Goihw16g, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, goihw, s8, Goihw8g, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, hwigo, s8, Goihw8g, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, goihw, s8, Goihw4g, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, hwigo, s8, Goihw4g, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, goihw, s8, gOwhi16o, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, hwigo, s8, gOwhi16o, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, goihw, s8, gOhwI16o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, hwigo, s8, gOhwI16o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, goihw, s8, gOIhw16i16o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, hwigo, s8, gOIhw16i16o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, idhwo, s8, OdhwI16o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, oidhw, s8, OdhwI16o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, dhwio, s8, OdhwI16o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, idhwo, s8, OIdhw16i16o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, oidhw, s8, OIdhw16i16o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, dhwio, s8, OIdhw16i16o4i, fmt_order::keep, spec::conv_req_comp))
 nullptr,
 }},
 {{f32, s8, 6}, {
- DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_uni_reorder_t))
- DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::jit_uni_reorder_t))
- DNNL_NON_X64_ONLY(REG_SR(f32, any, s8, dhwigo, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, goidhw, s8, gOIdhw4i16o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, goidhw, s8, gOIdhw2i8o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, goidhw, s8, gOIdhw4o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, goidhw, s8, gOdhwI16o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(f32, goidhw, s8, gOIdhw16i16o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_uni_reorder_t))
+ DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::jit_uni_reorder_t))
+ DNNL_NON_X64_ONLY(REG_SR(f32, any, s8, dhwigo, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, goidhw, s8, gOIdhw4i16o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, goidhw, s8, gOIdhw2i8o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, goidhw, s8, gOIdhw4o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, goidhw, s8, gOdhwI16o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(f32, goidhw, s8, gOIdhw16i16o4i, fmt_order::keep, spec::conv_req_comp))
 nullptr,
 }},
 });
diff --git a/src/cpu/reorder/cpu_reorder_comp_s8_s8.cpp b/src/cpu/reorder/cpu_reorder_comp_s8_s8.cpp
index 0b8c9322e48..4cb92ea0832 100644
--- a/src/cpu/reorder/cpu_reorder_comp_s8_s8.cpp
+++ b/src/cpu/reorder/cpu_reorder_comp_s8_s8.cpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2020-2024 Intel Corporation
+* Copyright 2020-2025 Intel Corporation
 * Copyright 2023 FUJITSU LIMITED
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
@@ -27,170 +27,175 @@ const impl_list_map_t &comp_s8_s8_impl_list_map() {
 static const impl_list_map_t the_map = REG_REORDER_P({
 // s8 -> s8
 {{s8, s8, 2}, {
- DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::brgemm_matmul_matrix_B_reorder_t))
- DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_uni_reorder_t))
- DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::jit_uni_reorder_t))
- DNNL_NON_X64_ONLY(REG_SR(s8, oi, s8, OI4i16o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, format_tag::io, s8, OI4i16o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, oi, s8, OI4i32o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, format_tag::io, s8, OI4i32o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, oi, s8, OI4i64o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, format_tag::io, s8, OI4i64o4i, fmt_order::keep, spec::conv_req_comp))
- REG_SR(s8, ab, s8, BA16a16b4a, fmt_order::keep, spec::conv_req_comp)
- REG_SR(s8, ab, s8, BA16a32b4a, fmt_order::keep, spec::conv_req_comp)
- REG_SR(s8, ab, s8, BA16a48b4a, fmt_order::keep, spec::conv_req_comp)
- REG_SR(s8, ab, s8, BA16a64b4a, fmt_order::keep, spec::conv_req_comp)
- REG_SR(s8, ba, s8, BA16a16b4a, fmt_order::keep, spec::conv_req_comp)
- REG_SR(s8, ba, s8, BA16a32b4a, fmt_order::keep, spec::conv_req_comp)
- REG_SR(s8, ba, s8, BA16a48b4a, fmt_order::keep, spec::conv_req_comp)
- REG_SR(s8, ba, s8, BA16a64b4a, fmt_order::keep, spec::conv_req_comp)
+ DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::brgemm_matmul_copy_reorder_t))
+ DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_uni_reorder_direct_copy_t))
+ DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_uni_reorder_t))
+ DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::jit_uni_reorder_t))
+ DNNL_NON_X64_ONLY(REG_SR(s8, oi, s8, OI4i16o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, format_tag::io, s8, OI4i16o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, oi, s8, OI4i32o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, format_tag::io, s8, OI4i32o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, oi, s8, OI4i64o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, format_tag::io, s8, OI4i64o4i, fmt_order::keep, spec::conv_req_comp))
+ REG_SR(s8, ab, s8, BA16a16b4a, fmt_order::keep, spec::conv_req_comp)
+ REG_SR(s8, ab, s8, BA16a32b4a, fmt_order::keep, spec::conv_req_comp)
+ REG_SR(s8, ab, s8, BA16a48b4a, fmt_order::keep, spec::conv_req_comp)
+ REG_SR(s8, ab, s8, BA16a64b4a, fmt_order::keep, spec::conv_req_comp)
+ REG_SR(s8, ba, s8, BA16a16b4a, fmt_order::keep, spec::conv_req_comp)
+ REG_SR(s8, ba, s8, BA16a32b4a, fmt_order::keep, spec::conv_req_comp)
+ REG_SR(s8, ba, s8, BA16a48b4a, fmt_order::keep, spec::conv_req_comp)
+ REG_SR(s8, ba, s8, BA16a64b4a, fmt_order::keep, spec::conv_req_comp)
 nullptr,
 }},
 // s8 -> s8
 {{s8, s8, 3}, {
- DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::brgemm_matmul_matrix_B_reorder_t))
- DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_uni_reorder_t))
- DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::jit_uni_reorder_t))
- DNNL_NON_X64_ONLY(REG_SR(s8, any, s8, wio, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, iwo, s8, OIw4i16o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, iwo, s8, OIw4i32o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, iwo, s8, OIw4i64o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, oiw, s8, OIw4i16o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, oiw, s8, OIw4i32o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, oiw, s8, OIw4i64o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, wio, s8, OIw4i16o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, wio, s8, OIw4i32o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, wio, s8, OIw4i64o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, iwo, s8, OIw2i8o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, oiw, s8, OIw2i8o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, wio, s8, OIw2i8o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, iwo, s8, OIw4o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, oiw, s8, OIw4o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, wio, s8, OIw4o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, iwo, s8, Owi16o, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, oiw, s8, Owi16o, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, wio, s8, Owi16o, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, iwo, s8, OwI16o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, oiw, s8, OwI16o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, wio, s8, OwI16o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, iwo, s8, OIw16i16o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, oiw, s8, OIw16i16o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, wio, s8, OIw16i16o4i, fmt_order::keep, spec::conv_req_comp))
- REG_SR(s8, abc, s8, aCB16b16c4b, fmt_order::keep, spec::conv_req_comp)
- REG_SR(s8, abc, s8, aCB16b32c4b, fmt_order::keep, spec::conv_req_comp)
- REG_SR(s8, abc, s8, aCB16b48c4b, fmt_order::keep, spec::conv_req_comp)
- REG_SR(s8, abc, s8, aCB16b64c4b, fmt_order::keep, spec::conv_req_comp)
- REG_SR(s8, acb, s8, aCB16b16c4b, fmt_order::keep, spec::conv_req_comp)
- REG_SR(s8, acb, s8, aCB16b32c4b, fmt_order::keep, spec::conv_req_comp)
- REG_SR(s8, acb, s8, aCB16b48c4b, fmt_order::keep, spec::conv_req_comp)
- REG_SR(s8, acb, s8, aCB16b64c4b, fmt_order::keep, spec::conv_req_comp)
+ DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::brgemm_matmul_copy_reorder_t))
+ DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_uni_reorder_direct_copy_t))
+ DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_uni_reorder_t))
+ DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::jit_uni_reorder_t))
+ DNNL_NON_X64_ONLY(REG_SR(s8, any, s8, wio, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, iwo, s8, OIw4i16o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, iwo, s8, OIw4i32o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, iwo, s8, OIw4i64o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, oiw, s8, OIw4i16o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, oiw, s8, OIw4i32o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, oiw, s8, OIw4i64o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, wio, s8, OIw4i16o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, wio, s8, OIw4i32o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, wio, s8, OIw4i64o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, iwo, s8, OIw2i8o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, oiw, s8, OIw2i8o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, wio, s8, OIw2i8o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, iwo, s8, OIw4o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, oiw, s8, OIw4o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, wio, s8, OIw4o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, iwo, s8, Owi16o, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, oiw, s8, Owi16o, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, wio, s8, Owi16o, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, iwo, s8, OwI16o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, oiw, s8, OwI16o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, wio, s8, OwI16o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, iwo, s8, OIw16i16o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, oiw, s8, OIw16i16o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, wio, s8, OIw16i16o4i, fmt_order::keep, spec::conv_req_comp))
+ REG_SR(s8, abc, s8, aCB16b16c4b, fmt_order::keep, spec::conv_req_comp)
+ REG_SR(s8, abc, s8, aCB16b32c4b, fmt_order::keep, spec::conv_req_comp)
+ REG_SR(s8, abc, s8, aCB16b48c4b, fmt_order::keep, spec::conv_req_comp)
+ REG_SR(s8, abc, s8, aCB16b64c4b, fmt_order::keep, spec::conv_req_comp)
+ REG_SR(s8, acb, s8, aCB16b16c4b, fmt_order::keep, spec::conv_req_comp)
+ REG_SR(s8, acb, s8, aCB16b32c4b, fmt_order::keep, spec::conv_req_comp)
+ REG_SR(s8, acb, s8, aCB16b48c4b, fmt_order::keep, spec::conv_req_comp)
+ REG_SR(s8, acb, s8, aCB16b64c4b, fmt_order::keep, spec::conv_req_comp)
 nullptr,
 }},
 {{s8, s8, 4}, {
- DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_uni_reorder_t))
- DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::jit_uni_reorder_t))
- DNNL_NON_X64_ONLY(REG_SR(s8, any, s8, hwio, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, any, s8, wigo, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, goiw, s8, gOIw4i16o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, wigo, s8, gOIw4i16o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, goiw, s8, gOIw2i8o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, wigo, s8, gOIw2i8o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, goiw, s8, gOIw4o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, wigo, s8, gOIw4o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, ihwo, s8, OIhw4i32o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, ihwo, s8, OIhw4i64o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, ihwo, s8, OIhw4i16o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, oihw, s8, OIhw4i32o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, oihw, s8, OIhw4i64o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, oihw, s8, OIhw4i16o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, hwio, s8, OIhw4i16o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, hwio, s8, OIhw4i32o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, hwio, s8, OIhw4i64o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, ihwo, s8, OIhw2i8o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, oihw, s8, OIhw2i8o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, hwio, s8, OIhw2i8o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, ihwo, s8, OIhw4o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, oihw, s8, OIhw4o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, hwio, s8, OIhw4o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, goiw, s8, Goiw16g, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, wigo, s8, Goiw16g, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, goiw, s8, Goiw8g, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, wigo, s8, Goiw8g, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, goiw, s8, Goiw4g, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, wigo, s8, Goiw4g, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, goiw, s8, gOwi16o, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, wigo, s8, gOwi16o, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, goiw, s8, gOwI16o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, wigo, s8, gOwI16o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, goiw, s8, gOIw16i16o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, wigo, s8, gOIw16i16o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, ihwo, s8, Owhi16o, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, oihw, s8, Owhi16o, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, hwio, s8, Owhi16o, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, ihwo, s8, OhwI16o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, oihw, s8, OhwI16o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, hwio, s8, OhwI16o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, ihwo, s8, OIhw16i16o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, oihw, s8, OIhw16i16o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, hwio, s8, OIhw16i16o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_uni_reorder_direct_copy_t))
+ DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_uni_reorder_t))
+ DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::jit_uni_reorder_t))
+ DNNL_NON_X64_ONLY(REG_SR(s8, any, s8, hwio, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, any, s8, wigo, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, goiw, s8, gOIw4i16o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, wigo, s8, gOIw4i16o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, goiw, s8, gOIw2i8o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, wigo, s8, gOIw2i8o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, goiw, s8, gOIw4o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, wigo, s8, gOIw4o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, ihwo, s8, OIhw4i32o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, ihwo, s8, OIhw4i64o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, ihwo, s8, OIhw4i16o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, oihw, s8, OIhw4i32o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, oihw, s8, OIhw4i64o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, oihw, s8, OIhw4i16o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, hwio, s8, OIhw4i16o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, hwio, s8, OIhw4i32o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, hwio, s8, OIhw4i64o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, ihwo, s8, OIhw2i8o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, oihw, s8, OIhw2i8o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, hwio, s8, OIhw2i8o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, ihwo, s8, OIhw4o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, oihw, s8, OIhw4o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, hwio, s8, OIhw4o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, goiw, s8, Goiw16g, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, wigo, s8, Goiw16g, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, goiw, s8, Goiw8g, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, wigo, s8, Goiw8g, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, goiw, s8, Goiw4g, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, wigo, s8, Goiw4g, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, goiw, s8, gOwi16o, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, wigo, s8, gOwi16o, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, goiw, s8, gOwI16o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, wigo, s8, gOwI16o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, goiw, s8, gOIw16i16o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, wigo, s8, gOIw16i16o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, ihwo, s8, Owhi16o, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, oihw, s8, Owhi16o, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, hwio, s8, Owhi16o, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, ihwo, s8, OhwI16o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, oihw, s8, OhwI16o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, hwio, s8, OhwI16o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, ihwo, s8, OIhw16i16o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, oihw, s8, OIhw16i16o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, hwio, s8, OIhw16i16o4i, fmt_order::keep, spec::conv_req_comp))
 nullptr,
 }},
 {{s8, s8, 5}, {
- DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_uni_reorder_t))
- DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::jit_uni_reorder_t))
- DNNL_NON_X64_ONLY(REG_SR(s8, any, s8, hwigo, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, any, s8, dhwio, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, goihw, s8, gOIhw4i16o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, hwigo, s8, gOIhw4i16o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, goihw, s8, gOIhw2i8o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, hwigo, s8, gOIhw2i8o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, goihw, s8, gOIhw4o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, hwigo, s8, gOIhw4o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, idhwo, s8, OIdhw4i16o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, idhwo, s8, OIdhw4i32o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, idhwo, s8, OIdhw4i64o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, oidhw, s8, OIdhw4i16o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, oidhw, s8, OIdhw4i32o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, oidhw, s8, OIdhw4i64o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, dhwio, s8, OIdhw4i16o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, dhwio, s8, OIdhw4i32o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, dhwio, s8, OIdhw4i64o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, idhwo, s8, OIdhw2i8o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, oidhw, s8, OIdhw2i8o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, dhwio, s8, OIdhw2i8o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, idhwo, s8, OIdhw4o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, oidhw, s8, OIdhw4o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, dhwio, s8, OIdhw4o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, goihw, s8, Goihw16g, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, hwigo, s8, Goihw16g, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, goihw, s8, Goihw8g, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, hwigo, s8, Goihw8g, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, goihw, s8, Goihw4g, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, hwigo, s8, Goihw4g, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, goihw, s8, gOwhi16o, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, hwigo, s8, gOwhi16o, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, goihw, s8, gOhwI16o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, hwigo, s8, gOhwI16o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, goihw, s8, gOIhw16i16o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, hwigo, s8, gOIhw16i16o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, idhwo, s8, OdhwI16o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, oidhw, s8, OdhwI16o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, dhwio, s8, OdhwI16o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, idhwo, s8, OIdhw16i16o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, oidhw, s8, OIdhw16i16o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, dhwio, s8, OIdhw16i16o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_uni_reorder_direct_copy_t))
+ DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_uni_reorder_t))
+ DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::jit_uni_reorder_t))
+ DNNL_NON_X64_ONLY(REG_SR(s8, any, s8, hwigo, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, any, s8, dhwio, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, goihw, s8, gOIhw4i16o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, hwigo, s8, gOIhw4i16o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, goihw, s8, gOIhw2i8o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, hwigo, s8, gOIhw2i8o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, goihw, s8, gOIhw4o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, hwigo, s8, gOIhw4o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, idhwo, s8, OIdhw4i16o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, idhwo, s8, OIdhw4i32o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, idhwo, s8, OIdhw4i64o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, oidhw, s8, OIdhw4i16o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, oidhw, s8, OIdhw4i32o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, oidhw, s8, OIdhw4i64o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, dhwio, s8, OIdhw4i16o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, dhwio, s8, OIdhw4i32o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, dhwio, s8, OIdhw4i64o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, idhwo, s8, OIdhw2i8o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, oidhw, s8, OIdhw2i8o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, dhwio, s8, OIdhw2i8o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, idhwo, s8, OIdhw4o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, oidhw, s8, OIdhw4o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, dhwio, s8, OIdhw4o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, goihw, s8, Goihw16g, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, hwigo, s8, Goihw16g, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, goihw, s8, Goihw8g, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, hwigo, s8, Goihw8g, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, goihw, s8, Goihw4g, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, hwigo, s8, Goihw4g, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, goihw, s8, gOwhi16o, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, hwigo, s8, gOwhi16o, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, goihw, s8, gOhwI16o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, hwigo, s8, gOhwI16o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, goihw, s8, gOIhw16i16o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, hwigo, s8, gOIhw16i16o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, idhwo, s8, OdhwI16o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, oidhw, s8, OdhwI16o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, dhwio, s8, OdhwI16o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, idhwo, s8, OIdhw16i16o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, oidhw, s8, OIdhw16i16o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, dhwio, s8, OIdhw16i16o4i, fmt_order::keep, spec::conv_req_comp))
 nullptr,
 }},
 {{s8, s8, 6}, {
- DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_uni_reorder_t))
- DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::jit_uni_reorder_t))
- DNNL_NON_X64_ONLY(REG_SR(s8, any, s8, dhwigo, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, goidhw, s8, gOIdhw4i16o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, goidhw, s8, gOIdhw2i8o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, goidhw, s8, gOIdhw4o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, goidhw, s8, gOdhwI16o4i, fmt_order::keep, spec::conv_req_comp))
- DNNL_NON_X64_ONLY(REG_SR(s8, goidhw, s8, gOIdhw16i16o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_uni_reorder_direct_copy_t))
+ DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_uni_reorder_t))
+ DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::jit_uni_reorder_t))
+ DNNL_NON_X64_ONLY(REG_SR(s8, any, s8, dhwigo, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, goidhw, s8, gOIdhw4i16o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, goidhw, s8, gOIdhw2i8o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, goidhw, s8, gOIdhw4o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, goidhw, s8, gOdhwI16o4i, fmt_order::keep, spec::conv_req_comp))
+ DNNL_NON_X64_ONLY(REG_SR(s8, goidhw, s8, gOIdhw16i16o4i, fmt_order::keep, spec::conv_req_comp))
 nullptr,
 }},
 });
diff --git a/src/cpu/reorder/cpu_reorder_pd.hpp b/src/cpu/reorder/cpu_reorder_pd.hpp
index d1c8499c151..1fbac15ee32 100644
--- a/src/cpu/reorder/cpu_reorder_pd.hpp
+++ b/src/cpu/reorder/cpu_reorder_pd.hpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2016-2024 Intel Corporation
+* Copyright 2016-2025 Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -38,6 +38,9 @@ struct cpu_reorder_pd_t : public reorder_pd_t {
 post_ops.len() == 1
 && post_ops.entry_[0].kind == primitive_kind::sum);
 VDISPATCH_REORDER(args_ok, VERBOSE_UNSUPPORTED_POSTOP);
+ auto gpu_zp = memory_extra_flags::compensation_gpu_conv_asymmetric_src;
+ VDISPATCH_REORDER(!(dst_md()->extra.flags & gpu_zp),
+ VERBOSE_UNSUPPORTED_MD_FLAG, "extra");
 return status::success;
 }
@@ -82,15 +85,15 @@ struct cpu_reorder_pd_t : public reorder_pd_t {
 const float *dst_scales) const {
 using namespace dnnl::impl::memory_tracking::names;
- int mask = -1;
- bool is_set = false;
- auto status = attr->scales_.get(DNNL_ARG_DST, &mask, &is_set);
- if (status != status::success) return nullptr;
+ if (attr->scales_.has_default_values(DNNL_ARG_DST)) {
+ return dst_scales;
+ }
 // It's possible that mask > 0 but `count` is still `1`. This case is
 // covered by `DEFINE_ARG_SCALES_BUFFER` macro and no need to inverse
 // in such case.
- if (is_set && mask > 0 && count > 1) {
+ int mask = attr->scales_.get_mask(DNNL_ARG_DST);
+ if (mask > 0 && count > 1) {
 auto loc_scales = scratchpad.template get<float>(
 key_reorder_precomputed_dst_scales);
 if (!loc_scales) return nullptr;
diff --git a/src/cpu/reorder/cpu_reorder_regular_bf16.cpp b/src/cpu/reorder/cpu_reorder_regular_bf16.cpp
index 192d36137bb..388afbfa39c 100644
--- a/src/cpu/reorder/cpu_reorder_regular_bf16.cpp
+++ b/src/cpu/reorder/cpu_reorder_regular_bf16.cpp
@@ -26,11 +26,11 @@ const impl_list_map_t &regular_bf16_impl_list_map() {
 static const impl_list_map_t the_map = REG_REORDER_P({
 // bf16 ->
 {{bf16, data_type::undef, 0}, {
- CPU_REORDER_INSTANCE(rnn_weights_reorder_t)
- DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::brgemm_matmul_matrix_B_reorder_t))
-
- DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_blk_reorder_t))
- DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_uni_reorder_t))
+ CPU_REORDER_INSTANCE(rnn_weights_reorder_t, bf16, bf16)
+ DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::brgemm_matmul_copy_reorder_t))
+ DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_uni_reorder_direct_copy_t))
+ DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_blk_reorder_t))
+ DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_uni_reorder_t))
 DNNL_NON_X64_ONLY(REG_SR_BIDIR(bf16, any, f32, nChw16c))
 DNNL_NON_X64_ONLY(REG_SR_BIDIR(bf16, any, f32, nCdhw16c))
@@ -53,14 +53,14 @@ const impl_list_map_t &regular_bf16_impl_list_map() {
 DNNL_NON_X64_ONLY(REG_SR_BIDIR(bf16, any, u8, OIdhw16o16i))
 DNNL_NON_X64_ONLY(REG_SR_BIDIR(bf16, any, u8, OIdhw16i16o))
- DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::jit_uni_reorder_t))
+ DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::jit_uni_reorder_t))
- REG_SR(bf16, any, bf16, any, fmt_order::any, spec::reference)
- REG_SR(bf16, any, f32, any, fmt_order::any, spec::reference)
- REG_SR(bf16, any, s8, any, fmt_order::any, spec::reference)
- REG_SR(bf16, any, u8, any, fmt_order::any, spec::reference)
- REG_SR(bf16, any, f8_e5m2, any, fmt_order::any, spec::reference)
- REG_SR(bf16, any, f8_e4m3, any, fmt_order::any, spec::reference)
+ REG_SR(bf16, any, bf16, any, fmt_order::any, spec::reference)
+ REG_SR(bf16, any, f32, any, fmt_order::any, spec::reference)
+ REG_SR(bf16, any, s8, any, fmt_order::any, spec::reference)
+ REG_SR(bf16, any, u8, any, fmt_order::any, spec::reference)
+ REG_SR(bf16, any, f8_e5m2, any, fmt_order::any, spec::reference)
+ REG_SR(bf16, any, f8_e4m3, any, fmt_order::any, spec::reference)
 nullptr,
 }},
diff --git a/src/cpu/reorder/cpu_reorder_regular_bin.cpp b/src/cpu/reorder/cpu_reorder_regular_bin.cpp
new file mode 100644
index 00000000000..3078feb1c2b
--- /dev/null
+++ b/src/cpu/reorder/cpu_reorder_regular_bin.cpp
@@ -0,0 +1,47 @@
+/*******************************************************************************
+* Copyright 2021 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#include "cpu/reorder/cpu_reorder.hpp"
+
+namespace dnnl {
+namespace impl {
+namespace cpu {
+
+// clang-format off
+
+const impl_list_map_t &regular_bin_impl_list_map() {
+ static const impl_list_map_t the_map = REG_REORDER_P({
+ // bin ->
+ {{bin, data_type::undef, 4}, {
+ REG_SR_DIRECT_COPY(bin, bin)
+
+ REG_SR(bin, any, bin, OIhw8o32i, fmt_order::keep)
+
+ REG_SR(bin, any, bin, OIhw16o32i, fmt_order::keep)
+
+ REG_SR_BIDIR(u8, any, u8, nChw8c)
+
+ nullptr,
+ }},
+ });
+ return the_map;
+}
+
+// clang-format on
+
+} // namespace cpu
+} // namespace impl
+} // namespace dnnl
diff --git a/src/cpu/reorder/cpu_reorder_regular_f16.cpp b/src/cpu/reorder/cpu_reorder_regular_f16.cpp
index 6d3bd322fef..5d6bb97ac57 100644
--- a/src/cpu/reorder/cpu_reorder_regular_f16.cpp
+++ b/src/cpu/reorder/cpu_reorder_regular_f16.cpp
@@ -28,16 +28,19 @@ const impl_list_map_t &regular_f16_impl_list_map() {
 {{f16, data_type::undef, 0}, {
 DNNL_AARCH64_ONLY(REG_SR_DIRECT_COPY(f16, f16))
- DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::brgemm_matmul_matrix_B_reorder_t))
- DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_blk_reorder_t))
- DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_uni_reorder_t))
-
- REG_SR(f16, any, f8_e5m2, any, fmt_order::any, spec::reference)
- REG_SR(f16, any, f8_e4m3, any, fmt_order::any, spec::reference)
- REG_SR(f16, any, f16, any, fmt_order::any, spec::reference)
- REG_SR(f16, any, f32, any, fmt_order::any, spec::reference)
- REG_SR(f16, any, s8, any, fmt_order::any, spec::reference)
- REG_SR(f16, any, u8, any, fmt_order::any, spec::reference)
+ DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::brgemm_matmul_copy_reorder_t))
+ DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_uni_reorder_direct_copy_t))
+ DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_blk_reorder_t))
+ DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_uni_reorder_t))
+
+ DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::jit_uni_reorder_t))
+
+ REG_SR(f16, any, f8_e5m2, any, fmt_order::any, spec::reference)
+ REG_SR(f16, any, f8_e4m3, any, fmt_order::any, spec::reference)
+ REG_SR(f16, any, f16, any, fmt_order::any, spec::reference)
+ REG_SR(f16, any, f32, any, fmt_order::any, spec::reference)
+ REG_SR(f16, any, s8, any, fmt_order::any, spec::reference)
+ REG_SR(f16, any, u8, any, fmt_order::any, spec::reference)
 nullptr,
 }},
diff --git a/src/cpu/reorder/cpu_reorder_regular_f32_bf16.cpp b/src/cpu/reorder/cpu_reorder_regular_f32_bf16.cpp
index 213f44723f7..9b6d5cd4f2d 100644
--- a/src/cpu/reorder/cpu_reorder_regular_f32_bf16.cpp
+++ b/src/cpu/reorder/cpu_reorder_regular_f32_bf16.cpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2020-2022 Intel Corporation
+* Copyright 2020-2024 Intel Corporation
 * Copyright 2023 Arm Ltd. and affiliates
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
@@ -27,15 +27,16 @@ const impl_list_map_t &regular_f32_bf16_impl_list_map() {
 static const impl_list_map_t the_map = REG_REORDER_P({
 // f32 -> bf16
 {{f32, bf16, 0}, {
- CPU_REORDER_INSTANCE(rnn_weights_reorder_t)
+ CPU_REORDER_INSTANCE(rnn_weights_reorder_t, f32, bf16)
- DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_blk_reorder_t))
- DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_uni_reorder_t))
+ DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_uni_reorder_direct_copy_t))
+ DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_blk_reorder_t))
+ DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_uni_reorder_t))
 DNNL_NON_X64_ONLY(REG_SR_BIDIR(f32, any, bf16, nChw16c))
 DNNL_NON_X64_ONLY(REG_SR_BIDIR(f32, any, bf16, nCdhw16c))
- DNNL_AARCH64_ACL_ONLY(CPU_REORDER_INSTANCE(aarch64::acl_reorder_fwd_t))
+ DNNL_AARCH64_ONLY(DNNL_ACL_ONLY(CPU_REORDER_INSTANCE(acl::acl_reorder_fwd_t)))
 DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::jit_uni_reorder_t))
 DNNL_NON_X64_ONLY(REG_SR(f32, oihw, bf16, OIhw8i16o2i, fmt_order::keep))
@@ -47,7 +48,7 @@ const impl_list_map_t &regular_f32_bf16_impl_list_map() {
 DNNL_NON_X64_ONLY(REG_SR(f32, oihw, bf16, OIhw16i16o, fmt_order::keep))
 DNNL_NON_X64_ONLY(REG_SR(f32, goihw, bf16, gOIhw16i16o, fmt_order::keep))
- REG_SR(f32, any, bf16, any, fmt_order::any, spec::reference)
+ REG_SR(f32, any, bf16, any, fmt_order::any, spec::reference)
 nullptr,
 }},
diff --git a/src/cpu/reorder/cpu_reorder_regular_f32_bin.cpp b/src/cpu/reorder/cpu_reorder_regular_f32_bin.cpp
new file mode 100644
index 00000000000..f050b6a648e
--- /dev/null
+++ b/src/cpu/reorder/cpu_reorder_regular_f32_bin.cpp
@@ -0,0 +1,42 @@
+/*******************************************************************************
+* Copyright 2020 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#include "cpu/reorder/cpu_reorder.hpp"
+
+namespace dnnl {
+namespace impl {
+namespace cpu {
+
+// clang-format off
+
+const impl_list_map_t &regular_f32_bin_impl_list_map() {
+ static const impl_list_map_t the_map = REG_REORDER_P({
+ // bin ->
+ {{f32, bin, 4}, {
+ REG_SR_BIDIR(f32, nchw, bin, nhwc)
+ REG_SR_BIDIR(f32, nhwc, bin, nhwc)
+
+ nullptr,
+ }},
+ });
+ return the_map;
+}
+
+// clang-format on
+
+} // namespace cpu
+} // namespace impl
+} // namespace dnnl
diff --git a/src/cpu/reorder/cpu_reorder_regular_f32_f16.cpp b/src/cpu/reorder/cpu_reorder_regular_f32_f16.cpp
index a7b1b006549..d4da37cc42d 100644
--- a/src/cpu/reorder/cpu_reorder_regular_f32_f16.cpp
+++ b/src/cpu/reorder/cpu_reorder_regular_f32_f16.cpp
@@ -1,5 +1,6 @@
 /*******************************************************************************
-* Copyright 2020-2022 Intel Corporation
+* Copyright 2020-2024 Intel Corporation
+* Copyright 2025 Arm Ltd. and affiliates
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -26,11 +27,13 @@ const impl_list_map_t &regular_f32_f16_impl_list_map() {
 static const impl_list_map_t the_map = REG_REORDER_P({
 // f32 -> f16
 {{f32, f16, 0}, {
- DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_blk_reorder_t))
- DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_uni_reorder_t))
+ DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_uni_reorder_direct_copy_t))
+ DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_blk_reorder_t))
+ DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_uni_reorder_t))
- REG_SR(f32, any, f16, any, fmt_order::any, spec::reference)
+ DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::jit_uni_reorder_t))
+
+ REG_SR(f32, any, f16, any, fmt_order::any, spec::reference)
 nullptr,
 }},
 });
diff --git a/src/cpu/reorder/cpu_reorder_regular_f32_f32.cpp b/src/cpu/reorder/cpu_reorder_regular_f32_f32.cpp
index a01fa058785..b2f86ff709d 100644
--- a/src/cpu/reorder/cpu_reorder_regular_f32_f32.cpp
+++ b/src/cpu/reorder/cpu_reorder_regular_f32_f32.cpp
@@ -1,6 +1,6 @@
 /*******************************************************************************
 * Copyright 2020-2024 Intel Corporation
-* Copyright 2022 FUJITSU LIMITED
+* Copyright 2022-2024 FUJITSU LIMITED
 * Copyright 2023 Arm Ltd. and affiliates
and affiliates * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -28,30 +28,34 @@ const impl_list_map_t ®ular_f32_f32_impl_list_map() { static const impl_list_map_t the_map = REG_REORDER_P({ // f32 -> f32 {{f32, f32, 0}, { - REG_FAST_DIRECT_COPY_F32_F32 - - DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::brgemm_matmul_matrix_B_reorder_t)) - DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_blk_reorder_t)) - DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_uni_reorder_t)) + DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64_brgemm_matmul_copy_reorder_t)) + DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64_jit_uni_reorder_direct_copy_t)) + DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64_jit_blk_reorder_t)) + DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64_jit_uni_reorder_t)) - DNNL_AARCH64_ACL_ONLY(CPU_REORDER_INSTANCE(aarch64::acl_reorder_fwd_t)) + DNNL_AARCH64_ONLY(DNNL_ACL_ONLY(CPU_REORDER_INSTANCE(acl::acl_reorder_fwd_t))) DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::brgemm_matmul_matrix_B_reorder_t)) - DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::jit_blk_reorder_t)) - DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::jit_uni_reorder_t)) - REG_SR(f32, any, f32, any, fmt_order::any, spec::reference) + DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64_jit_blk_reorder_t)) + DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64_jit_uni_reorder_t)) + + REG_FAST_DIRECT_COPY_F32_F32 + + REG_SR(f32, any, f32, any, fmt_order_any, spec::reference) nullptr, }}, {{f32, f32, 3}, { - REG_FAST_DIRECT_COPY_F32_F32 - - DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::brgemm_matmul_matrix_B_reorder_t)) - DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_blk_reorder_t)) - DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_uni_reorder_t)) + DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64_brgemm_matmul_copy_reorder_t)) + DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64_jit_uni_reorder_direct_copy_t)) + DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64_jit_blk_reorder_t)) + DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64_jit_uni_reorder_t)) DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::brgemm_matmul_matrix_B_reorder_t)) - DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::jit_blk_reorder_t)) - DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::jit_uni_reorder_t)) + DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64_jit_blk_reorder_t)) + DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64_jit_uni_reorder_t)) + + REG_FAST_DIRECT_COPY_F32_F32 + DNNL_NON_X64_ONLY(REG_SR_BIDIR(f32, any, f32, nCw16c)) DNNL_NON_X64_ONLY(REG_SR_BIDIR(f32, any, f32, nCw8c)) DNNL_NON_X64_ONLY(REG_SR_BIDIR(f32, any, f32, nCw4c)) @@ -66,23 +70,26 @@ const impl_list_map_t ®ular_f32_f32_impl_list_map() { DNNL_NON_X64_ONLY(REG_SR_BIDIR(f32, any, f32, OIw16o16i)) DNNL_NON_X64_ONLY(REG_SR_BIDIR(f32, any, f32, OIw16i16o)) + DNNL_NON_X64_ONLY(REG_SR_BIDIR(f32, any, f32, IOw8o8i)) DNNL_NON_X64_ONLY(REG_SR_BIDIR(f32, any, f32, IOw16o16i)) - REG_SR(f32, any, f32, any, fmt_order::any, spec::reference) + REG_SR(f32, any, f32, any, fmt_order_any, spec_reference) nullptr, }}, {{f32, f32, 4}, { - CPU_REORDER_INSTANCE(rnn_weights_reorder_t) + CPU_REORDER_INSTANCE(rnn_weights_reorder_t, f32, f32) - REG_FAST_DIRECT_COPY_F32_F32 + DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64_brgemm_matmul_copy_reorder_t)) + DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64_jit_uni_reorder_direct_copy_t)) + DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64_jit_blk_reorder_t)) + DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64_jit_uni_reorder_t)) - DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_blk_reorder_t)) - DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_uni_reorder_t)) + 
DNNL_AARCH64_ONLY(DNNL_ACL_ONLY(CPU_REORDER_INSTANCE(acl::acl_reorder_fwd_t))) + DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64_jit_blk_reorder_t)) + DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64_jit_uni_reorder_t)) - DNNL_AARCH64_ACL_ONLY(CPU_REORDER_INSTANCE(aarch64::acl_reorder_fwd_t)) - DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::jit_blk_reorder_t)) - DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::jit_uni_reorder_t)) + REG_FAST_DIRECT_COPY_F32_F32 DNNL_NON_X64_ONLY(REG_SR_BIDIR(f32, any, f32, nChw16c)) DNNL_NON_X64_ONLY(REG_SR_BIDIR(f32, any, f32, nChw8c)) @@ -98,6 +105,7 @@ const impl_list_map_t ®ular_f32_f32_impl_list_map() { DNNL_NON_X64_ONLY(REG_SR_BIDIR(f32, any, f32, gOIw16o16i)) DNNL_NON_X64_ONLY(REG_SR_BIDIR(f32, any, f32, gOIw16i16o)) + DNNL_NON_X64_ONLY(REG_SR_BIDIR(f32, any, f32, gIOw8o8i)) DNNL_NON_X64_ONLY(REG_SR_BIDIR(f32, any, f32, gIOw16o16i)) DNNL_NON_X64_ONLY(REG_SR_BIDIR(f32, any, f32, OIhw4i4o)) @@ -113,24 +121,27 @@ const impl_list_map_t ®ular_f32_f32_impl_list_map() { DNNL_NON_X64_ONLY(REG_SR_BIDIR(f32, any, f32, Ohwi16o)) DNNL_NON_X64_ONLY(REG_SR_BIDIR(f32, any, f32, OIhw16o16i)) DNNL_NON_X64_ONLY(REG_SR_BIDIR(f32, any, f32, OIhw16i16o)) + DNNL_NON_X64_ONLY(REG_SR_BIDIR(f32, any, f32, IOhw8o8i)) DNNL_NON_X64_ONLY(REG_SR_BIDIR(f32, any, f32, IOhw16o16i)) DNNL_NON_X64_ONLY(REG_SR_BIDIR(f32, any, f32, OIhw4i16o4i)) - REG_SR(f32, any, f32, any, fmt_order::any, spec::reference) + REG_SR(f32, any, f32, any, fmt_order_any, spec_reference) nullptr, }}, {{f32, f32, 5}, { - CPU_REORDER_INSTANCE(rnn_weights_reorder_t) + CPU_REORDER_INSTANCE(rnn_weights_reorder_t, f32, f32) - REG_FAST_DIRECT_COPY_F32_F32 + DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64_brgemm_matmul_copy_reorder_t)) + DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64_jit_uni_reorder_direct_copy_t)) + DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64_jit_blk_reorder_t)) + DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64_jit_uni_reorder_t)) - DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_blk_reorder_t)) - DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_uni_reorder_t)) + DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64_jit_blk_reorder_t)) + DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64_jit_uni_reorder_t)) - DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::jit_blk_reorder_t)) - DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::jit_uni_reorder_t)) + REG_FAST_DIRECT_COPY_F32_F32 DNNL_NON_X64_ONLY(REG_SR_BIDIR(f32, any, f32, nCdhw16c)) DNNL_NON_X64_ONLY(REG_SR_BIDIR(f32, any, f32, nCdhw8c)) @@ -151,6 +162,7 @@ const impl_list_map_t ®ular_f32_f32_impl_list_map() { DNNL_NON_X64_ONLY(REG_SR_BIDIR(f32, any, f32, gOhwi16o)) DNNL_NON_X64_ONLY(REG_SR_BIDIR(f32, any, f32, gOIhw16o16i)) DNNL_NON_X64_ONLY(REG_SR_BIDIR(f32, any, f32, gOIhw16i16o)) + DNNL_NON_X64_ONLY(REG_SR_BIDIR(f32, any, f32, gIOhw8o8i)) DNNL_NON_X64_ONLY(REG_SR_BIDIR(f32, any, f32, gIOhw16o16i)) DNNL_NON_X64_ONLY(REG_SR_BIDIR(f32, any, f32, OIdhw4i4o)) @@ -164,23 +176,24 @@ const impl_list_map_t ®ular_f32_f32_impl_list_map() { DNNL_NON_X64_ONLY(REG_SR_BIDIR(f32, any, f32, Odhwi16o)) DNNL_NON_X64_ONLY(REG_SR_BIDIR(f32, any, f32, OIdhw16o16i)) DNNL_NON_X64_ONLY(REG_SR_BIDIR(f32, any, f32, OIdhw16i16o)) + DNNL_NON_X64_ONLY(REG_SR_BIDIR(f32, any, f32, IOdhw8o8i)) DNNL_NON_X64_ONLY(REG_SR_BIDIR(f32, any, f32, IOdhw16o16i)) DNNL_NON_X64_ONLY(REG_SR_BIDIR(f32, any, f32, gOIhw4i16o4i)) - REG_SR(f32, any, f32, any, fmt_order::any, spec::reference) + REG_SR(f32, any, f32, any, fmt_order_any, spec_reference) nullptr, }}, {{f32, f32, 6}, { - REG_FAST_DIRECT_COPY_F32_F32 + 
DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64_jit_uni_reorder_direct_copy_t)) + DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64_jit_blk_reorder_t)) + DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64_jit_uni_reorder_t)) - DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_blk_reorder_t)) - DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_uni_reorder_t)) - - DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::jit_blk_reorder_t)) - DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::jit_uni_reorder_t)) + DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64_jit_blk_reorder_t)) + DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64_jit_uni_reorder_t)) + REG_FAST_DIRECT_COPY_F32_F32 DNNL_NON_X64_ONLY(REG_SR_BIDIR(f32, any, f32, gOIdhw4i4o)) DNNL_NON_X64_ONLY(REG_SR_BIDIR(f32, any, f32, gOIdhw4o4i)) @@ -193,9 +206,10 @@ const impl_list_map_t ®ular_f32_f32_impl_list_map() { DNNL_NON_X64_ONLY(REG_SR_BIDIR(f32, any, f32, gOdhwi16o)) DNNL_NON_X64_ONLY(REG_SR_BIDIR(f32, any, f32, gOIdhw16o16i)) DNNL_NON_X64_ONLY(REG_SR_BIDIR(f32, any, f32, gOIdhw16i16o)) + DNNL_NON_X64_ONLY(REG_SR_BIDIR(f32, any, f32, gIOdhw8o8i)) DNNL_NON_X64_ONLY(REG_SR_BIDIR(f32, any, f32, gIOdhw16o16i)) - REG_SR(f32, any, f32, any, fmt_order::any, spec::reference) + REG_SR(f32, any, f32, any, fmt_order_any, spec_reference) nullptr, }}, diff --git a/src/cpu/reorder/cpu_reorder_regular_f32_fp8.cpp b/src/cpu/reorder/cpu_reorder_regular_f32_fp8.cpp index e313c77fb1e..dd642125d53 100644 --- a/src/cpu/reorder/cpu_reorder_regular_f32_fp8.cpp +++ b/src/cpu/reorder/cpu_reorder_regular_f32_fp8.cpp @@ -31,6 +31,7 @@ const impl_list_map_t ®ular_f32_fp8_impl_list_map() { }}, // f32 -> f8_e5m2 {{f32, f8_e5m2, 0}, { + DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_uni_reorder_direct_copy_t)) DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_blk_reorder_t)) DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_uni_reorder_t)) @@ -40,6 +41,7 @@ const impl_list_map_t ®ular_f32_fp8_impl_list_map() { }}, // f32 -> f8_e4m3 {{f32, f8_e4m3, 0}, { + DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_uni_reorder_direct_copy_t)) DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_blk_reorder_t)) DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_uni_reorder_t)) diff --git a/src/cpu/reorder/cpu_reorder_regular_f32_s32.cpp b/src/cpu/reorder/cpu_reorder_regular_f32_s32.cpp index b1881df80e0..7961f8f361b 100644 --- a/src/cpu/reorder/cpu_reorder_regular_f32_s32.cpp +++ b/src/cpu/reorder/cpu_reorder_regular_f32_s32.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2020-2022 Intel Corporation +* Copyright 2020-2024 Intel Corporation * Copyright 2022 FUJITSU LIMITED * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -27,15 +27,18 @@ const impl_list_map_t ®ular_f32_s32_impl_list_map() { static const impl_list_map_t the_map = REG_REORDER_P({ // f32 -> s32 {{f32, s32, 0}, { - REG_FAST_DIRECT_COPY(f32, s32) + DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_uni_reorder_direct_copy_t)) + DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64_jit_blk_reorder_t)) + DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64_jit_uni_reorder_t)) + + DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64_jit_blk_reorder_t)) + DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64_jit_uni_reorder_t)) - DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_blk_reorder_t)) - DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_uni_reorder_t)) + REG_FAST_DIRECT_COPY(f32, s32) - DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::jit_blk_reorder_t)) - DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::jit_uni_reorder_t)) DNNL_NON_X64_ONLY(REG_SR_BIDIR(f32, any, s32, nChw16c)) 
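+ // Note: each of these lists is scanned in order when a reorder primitive descriptor is created, and the first implementation that reports success is taken, so keeping the JIT direct-copy and blocked reorders ahead of REG_FAST_DIRECT_COPY and the reference REG_SR fallbacks is what gives them dispatch priority.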
- REG_SR(f32, any, s32, any, fmt_order::any, spec::reference) + + REG_SR(f32, any, s32, any, fmt_order_any, spec_reference) nullptr, }}, diff --git a/src/cpu/reorder/cpu_reorder_regular_f32_s8.cpp b/src/cpu/reorder/cpu_reorder_regular_f32_s8.cpp index 7ce25752c7c..a3878c5d630 100644 --- a/src/cpu/reorder/cpu_reorder_regular_f32_s8.cpp +++ b/src/cpu/reorder/cpu_reorder_regular_f32_s8.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2020-2023 Intel Corporation +* Copyright 2020-2024 Intel Corporation * Copyright 2022 FUJITSU LIMITED * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -27,23 +27,28 @@ const impl_list_map_t ®ular_f32_s8_impl_list_map() { static const impl_list_map_t the_map = REG_REORDER_P({ // f32 -> s8 {{f32, s8, 0}, { - CPU_REORDER_INSTANCE(rnn_data_reorder_t) - CPU_REORDER_INSTANCE(rnn_weights_reorder_s8_t) - CPU_REORDER_INSTANCE(rnn_brgemm_weights_reorder_s8_t) + // TODO: move it down when checks for sparse md are implemented in other implementations. + DNNL_X64_ONLY(REG_SPARSE_SR(f32, oi, s8, OI16i64o4i, sparse_inputs_order::keep, sparse_spec::reference)) + DNNL_X64_ONLY(REG_SPARSE_SR(f32, format_tag::io, s8, OI16i64o4i, sparse_inputs_order::keep, sparse_spec::reference)) - REG_FAST_DIRECT_COPY(f32, s8) + CPU_REORDER_INSTANCE(rnn_data_reorder_t, f32, s8) + CPU_REORDER_INSTANCE(rnn_weights_reorder_s8_t, f32) + CPU_REORDER_INSTANCE(rnn_brgemm_weights_reorder_s8_t, f32, s8) + + DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_uni_reorder_direct_copy_t)) + DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64_jit_blk_reorder_t)) + DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64_jit_uni_reorder_t)) - DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_blk_reorder_t)) - DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_uni_reorder_t)) + DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64_jit_blk_reorder_t)) + DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64_jit_uni_reorder_t)) - DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::jit_blk_reorder_t)) - DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::jit_uni_reorder_t)) + REG_FAST_DIRECT_COPY(f32, s8) DNNL_NON_X64_ONLY(REG_SR_BIDIR(f32, any, s8, nChw16c)) DNNL_NON_X64_ONLY(REG_SR_BIDIR(f32, any, s8, OIhw4i16o4i)) DNNL_NON_X64_ONLY(REG_SR_BIDIR(f32, any, s8, gOIhw4i16o4i)) - REG_SR(f32, any, s8, any, fmt_order::any, spec::reference) + REG_SR(f32, any, s8, any, fmt_order_any, spec_reference) REG_SPARSE_SR_X64(f32, any, s8, any) diff --git a/src/cpu/reorder/cpu_reorder_regular_f32_u8.cpp b/src/cpu/reorder/cpu_reorder_regular_f32_u8.cpp index d306c3abeb8..923e74a28ac 100644 --- a/src/cpu/reorder/cpu_reorder_regular_f32_u8.cpp +++ b/src/cpu/reorder/cpu_reorder_regular_f32_u8.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2020-2022 Intel Corporation +* Copyright 2020-2024 Intel Corporation * Copyright 2022 FUJITSU LIMITED * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -27,17 +27,20 @@ const impl_list_map_t ®ular_f32_u8_impl_list_map() { static const impl_list_map_t the_map = REG_REORDER_P({ // f32 -> u8 {{f32, u8, 0}, { - CPU_REORDER_INSTANCE(rnn_data_reorder_t) + CPU_REORDER_INSTANCE(rnn_data_reorder_t, f32, u8) - REG_FAST_DIRECT_COPY(f32, u8) + DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_uni_reorder_direct_copy_t)) + DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64_jit_blk_reorder_t)) + DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64_jit_uni_reorder_t)) + + DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64_jit_blk_reorder_t)) + 
DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64_jit_uni_reorder_t)) - DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_blk_reorder_t)) - DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_uni_reorder_t)) + REG_FAST_DIRECT_COPY(f32, u8) - DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::jit_blk_reorder_t)) - DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::jit_uni_reorder_t)) DNNL_NON_X64_ONLY(REG_SR_BIDIR(f32, any, u8, nChw16c)) - REG_SR(f32, any, u8, any, fmt_order::any, spec::reference) + + REG_SR(f32, any, u8, any, fmt_order_any, spec_reference) nullptr, }}, diff --git a/src/cpu/reorder/cpu_reorder_regular_f4.cpp b/src/cpu/reorder/cpu_reorder_regular_f4.cpp new file mode 100644 index 00000000000..f42b401726c --- /dev/null +++ b/src/cpu/reorder/cpu_reorder_regular_f4.cpp @@ -0,0 +1,48 @@ +/******************************************************************************* +* Copyright 2021 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#include "cpu/reorder/cpu_reorder.hpp" + +namespace dnnl { +namespace impl { +namespace cpu { + +// clang-format off + +const impl_list_map_t ®ular_f4_impl_list_map() { + static const impl_list_map_t the_map = REG_REORDER_P({ + // f4_e2m1 -> + {{f4_e2m1, data_type::undef, 0}, { + REG_SR(f4_e2m1, any, f4_e2m1, OI8i8o2i, fmt_order_keep) + REG_SR(f4_e2m1, any, f4_e2m1, OI8i16o2i, fmt_order_keep) + REG_SR(f4_e2m1, any, f4_e2m1, OI8i24o2i, fmt_order_keep) + REG_SR(f4_e2m1, any, f4_e2m1, OI8i32o2i, fmt_order_keep) + REG_SR(f4_e2m1, any, f4_e2m1, OI8i64o2i, fmt_order_keep) + REG_SR(f4_e2m1, any, f4_e2m1, OI16i16o2i, fmt_order_keep) + REG_SR(f4_e2m1, any, f4_e2m1, OI16i32o2i, fmt_order_keep) + REG_SR(f4_e2m1, any, f4_e2m1, OI16i48o2i, fmt_order_keep) + REG_SR(f4_e2m1, any, f4_e2m1, OI16i64o2i, fmt_order_keep) + nullptr, + }}, + }); + return the_map; +} + +// clang-format on + +} // namespace cpu +} // namespace impl +} // namespace dnnl diff --git a/src/cpu/reorder/cpu_reorder_regular_fp4.cpp b/src/cpu/reorder/cpu_reorder_regular_fp4.cpp new file mode 100644 index 00000000000..49e3a0ae604 --- /dev/null +++ b/src/cpu/reorder/cpu_reorder_regular_fp4.cpp @@ -0,0 +1,51 @@ +/******************************************************************************* +* Copyright 2024 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +#include "cpu/reorder/cpu_reorder.hpp" + +namespace dnnl { +namespace impl { +namespace cpu { + +// clang-format off + +const impl_list_map_t ®ular_fp4_impl_list_map() { + static const impl_list_map_t the_map = REG_REORDER_P({ + {{f32, f4_e2m1, 0}, { + REG_SR(f32, any, f4_e2m1, any, fmt_order::any, spec::reference) + nullptr, + }}, + {{f4_e2m1, data_type::undef, 0}, { + REG_SR(f4_e2m1, any, f32, any, fmt_order::any, spec::reference) + nullptr, + }}, + {{f32, f4_e3m0, 0}, { + REG_SR(f32, any, f4_e3m0, any, fmt_order::any, spec::reference) + nullptr, + }}, + {{f4_e3m0, data_type::undef, 0}, { + REG_SR(f4_e3m0, any, f32, any, fmt_order::any, spec::reference) + nullptr, + }}, + }); + return the_map; +} + +// clang-format on + +} // namespace cpu +} // namespace impl +} // namespace dnnl diff --git a/src/cpu/reorder/cpu_reorder_regular_fp8.cpp b/src/cpu/reorder/cpu_reorder_regular_fp8.cpp index 81ef168d728..bd08fda826d 100644 --- a/src/cpu/reorder/cpu_reorder_regular_fp8.cpp +++ b/src/cpu/reorder/cpu_reorder_regular_fp8.cpp @@ -26,6 +26,7 @@ const impl_list_map_t ®ular_fp8_impl_list_map() { static const impl_list_map_t the_map = REG_REORDER_P({ // f8_e5m2 -> {{f8_e5m2, data_type::undef, 0}, { + DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_uni_reorder_direct_copy_t)) DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_blk_reorder_t)) DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_uni_reorder_t)) @@ -38,6 +39,7 @@ const impl_list_map_t ®ular_fp8_impl_list_map() { }}, // f8_e4m3 -> {{f8_e4m3, data_type::undef, 0}, { + DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_uni_reorder_direct_copy_t)) DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_blk_reorder_t)) DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_uni_reorder_t)) @@ -46,6 +48,12 @@ const impl_list_map_t ®ular_fp8_impl_list_map() { REG_SR(f8_e4m3, any, bf16, any, fmt_order::any, spec::reference) REG_SR(f8_e4m3, any, f32, any, fmt_order::any, spec::reference) + nullptr, + }}, + // f8_e8m0 -> + {{e8m0, data_type::undef, 0}, { + REG_SR(e8m0, any, e8m0, any, fmt_order::any, spec::reference) + nullptr, }}, }); diff --git a/src/cpu/reorder/cpu_reorder_regular_nf4.cpp b/src/cpu/reorder/cpu_reorder_regular_nf4.cpp new file mode 100644 index 00000000000..67d7f3bb96c --- /dev/null +++ b/src/cpu/reorder/cpu_reorder_regular_nf4.cpp @@ -0,0 +1,49 @@ +/******************************************************************************* +* Copyright 2021 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +#include "cpu/reorder/cpu_reorder.hpp" + +namespace dnnl { +namespace impl { +namespace cpu { + +// clang-format off + +const impl_list_map_t ®ular_nf4_impl_list_map() { + static const impl_list_map_t the_map = REG_REORDER_P({ + // nf4 -> + {{nf4, data_type::undef, 0}, { + REG_SR(nf4, any, nf4, OI8i8o2i, fmt_order_keep) + REG_SR(nf4, any, nf4, OI8i16o2i, fmt_order_keep) + REG_SR(nf4, any, nf4, OI8i24o2i, fmt_order_keep) + REG_SR(nf4, any, nf4, OI8i32o2i, fmt_order_keep) + REG_SR(nf4, any, nf4, OI8i64o2i, fmt_order_keep) + REG_SR(nf4, any, nf4, OI16i16o2i, fmt_order_keep) + REG_SR(nf4, any, nf4, OI16i32o2i, fmt_order_keep) + REG_SR(nf4, any, nf4, OI16i48o2i, fmt_order_keep) + REG_SR(nf4, any, nf4, OI16i64o2i, fmt_order_keep) + REG_SR(nf4, any, f32, any, fmt_order_keep, spec::reference) + nullptr, + }}, + }); + return the_map; +} + +// clang-format on + +} // namespace cpu +} // namespace impl +} // namespace dnnl diff --git a/src/cpu/reorder/cpu_reorder_regular_s32.cpp b/src/cpu/reorder/cpu_reorder_regular_s32.cpp index a8197402b0a..30cd1392b37 100644 --- a/src/cpu/reorder/cpu_reorder_regular_s32.cpp +++ b/src/cpu/reorder/cpu_reorder_regular_s32.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2020-2022 Intel Corporation +* Copyright 2020-2024 Intel Corporation * Copyright 2022 FUJITSU LIMITED * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -27,26 +27,27 @@ const impl_list_map_t ®ular_s32_impl_list_map() { static const impl_list_map_t the_map = REG_REORDER_P({ // s32 -> {{s32, data_type::undef, 0}, { + DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_uni_reorder_direct_copy_t)) + DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64_jit_blk_reorder_t)) + DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64_jit_uni_reorder_t)) + + DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64_jit_blk_reorder_t)) + DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64_jit_uni_reorder_t)) + REG_FAST_DIRECT_COPY(s32, f32) REG_FAST_DIRECT_COPY(s32, s32) REG_FAST_DIRECT_COPY(s32, s8) REG_FAST_DIRECT_COPY(s32, u8) - DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_blk_reorder_t)) - DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_uni_reorder_t)) - - DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::jit_blk_reorder_t)) - DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::jit_uni_reorder_t)) - DNNL_NON_X64_ONLY(REG_SR_BIDIR(s32, any, f32, nChw16c)) DNNL_NON_X64_ONLY(REG_SR_BIDIR(s32, any, s32, nChw16c)) DNNL_NON_X64_ONLY(REG_SR_BIDIR(s32, any, s8, nChw16c)) DNNL_NON_X64_ONLY(REG_SR_BIDIR(s32, any, u8, nChw16c)) - REG_SR(s32, any, f32, any, fmt_order::any, spec::reference) - REG_SR(s32, any, s32, any, fmt_order::any, spec::reference) - REG_SR(s32, any, s8, any, fmt_order::any, spec::reference) - REG_SR(s32, any, u8, any, fmt_order::any, spec::reference) + REG_SR(s32, any, f32, any, fmt_order_any, spec_reference) + REG_SR(s32, any, s32, any, fmt_order_any, spec_reference) + REG_SR(s32, any, s8, any, fmt_order_any, spec_reference) + REG_SR(s32, any, u8, any, fmt_order_any, spec_reference) nullptr, }}, diff --git a/src/cpu/reorder/cpu_reorder_regular_s4.cpp b/src/cpu/reorder/cpu_reorder_regular_s4.cpp index 17bfdba758e..901a683df6c 100644 --- a/src/cpu/reorder/cpu_reorder_regular_s4.cpp +++ b/src/cpu/reorder/cpu_reorder_regular_s4.cpp @@ -28,9 +28,26 @@ const impl_list_map_t ®ular_s4_impl_list_map() { REG_SR(f32, any, s4, any, fmt_order::any, spec::reference) nullptr, }}, + {{s4, data_type::undef, 0}, { 
+ REG_SR(s4, any, s4, OI8i8o2i, fmt_order_keep) + REG_SR(s4, any, s4, OI8i16o2i, fmt_order_keep) + REG_SR(s4, any, s4, OI8i24o2i, fmt_order_keep) + REG_SR(s4, any, s4, OI8i32o2i, fmt_order_keep) + REG_SR(s4, any, s4, OI8i64o2i, fmt_order_keep) + REG_SR(s4, any, s4, OI16i16o2i, fmt_order_keep) + REG_SR(s4, any, s4, OI16i32o2i, fmt_order_keep) + REG_SR(s4, any, s4, OI16i48o2i, fmt_order_keep) + REG_SR(s4, any, s4, OI16i64o2i, fmt_order_keep) + REG_SR(s4, any, u8, any, fmt_order_keep, spec::reference) + REG_SR(s4, any, f32, any, fmt_order_keep, spec::reference) + REG_SR(s4, any, f32, any, fmt_order::any, spec::reference) + REG_SR(s4, any, bf16, any, fmt_order::any, spec::reference) + REG_SR(s4, any, f16, any, fmt_order::any, spec::reference) + nullptr, + }}, {{s4, f32, 0}, { - DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::brgemm_matmul_matrix_B_reorder_t)) REG_SR(s4, any, f32, any, fmt_order::any, spec::reference) + DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64_brgemm_matmul_copy_reorder_t)) nullptr, }}, }); diff --git a/src/cpu/reorder/cpu_reorder_regular_s8.cpp b/src/cpu/reorder/cpu_reorder_regular_s8.cpp index c7199e1c41f..a346f1f50b5 100644 --- a/src/cpu/reorder/cpu_reorder_regular_s8.cpp +++ b/src/cpu/reorder/cpu_reorder_regular_s8.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2020-2023 Intel Corporation +* Copyright 2020-2024 Intel Corporation * Copyright 2022 FUJITSU LIMITED * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -28,9 +28,13 @@ const impl_list_map_t &regular_s8_impl_list_map() { static const impl_list_map_t the_map = REG_REORDER_P({ // s8 -> {{s8, data_type::undef, 0}, { - CPU_REORDER_INSTANCE(rnn_weights_reorder_s8_t) - CPU_REORDER_INSTANCE(rnn_brgemm_weights_reorder_s8_t) - DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::brgemm_matmul_matrix_B_reorder_t)) + // TODO: move it down when checks for sparse md are implemented in other implementations.
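+ // The sparse entries just below are registered first so that the dense implementations, which do not yet check for sparse mds, cannot be picked for sparse inputs by mistake (see the TODO above).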
+ DNNL_X64_ONLY(REG_SPARSE_SR(s8, oi, s8, OI16i64o4i, sparse_inputs_order::keep, sparse_spec::reference)) + DNNL_X64_ONLY(REG_SPARSE_SR(s8, format_tag::io, s8, OI16i64o4i, sparse_inputs_order::keep, sparse_spec::reference)) + + CPU_REORDER_INSTANCE(rnn_weights_reorder_s8_t, s8) + CPU_REORDER_INSTANCE(rnn_brgemm_weights_reorder_s8_t, s8, s8) + DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64_brgemm_matmul_copy_reorder_t)) REG_FAST_DIRECT_COPY(s8, f32) REG_FAST_DIRECT_COPY(s8, s32) @@ -39,12 +43,12 @@ const impl_list_map_t &regular_s8_impl_list_map() { REG_FAST_DIRECT_COPY(s8, s8) REG_FAST_DIRECT_COPY(s8, u8) - DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_blk_reorder_t)) - DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_uni_reorder_t)) - - DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::jit_blk_reorder_t)) - DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::jit_uni_reorder_t)) + DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_uni_reorder_direct_copy_t)) + DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64_jit_blk_reorder_t)) + DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64_jit_uni_reorder_t)) + DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64_jit_blk_reorder_t)) + DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64_jit_uni_reorder_t)) DNNL_NON_X64_ONLY(REG_SR_BIDIR(s8, any, f32, nChw16c)) DNNL_NON_X64_ONLY(REG_SR_BIDIR(s8, any, s32, nChw16c)) DNNL_NON_X64_ONLY(REG_SR_BIDIR(s8, any, bf16, nChw16c)) @@ -58,12 +62,12 @@ const impl_list_map_t &regular_s8_impl_list_map() { DNNL_NON_X64_ONLY(REG_SR_BIDIR(s8, any, bf16, gOIhw4i16o4i)) DNNL_NON_X64_ONLY(REG_SR_BIDIR(s8, any, s8, gOIhw4i16o4i)) - REG_SR(s8, any, f32, any, fmt_order::any, spec::reference) - REG_SR(s8, any, s32, any, fmt_order::any, spec::reference) - REG_SR(s8, any, bf16, any, fmt_order::any, spec::reference) - REG_SR(s8, any, f16, any, fmt_order::any, spec::reference) - REG_SR(s8, any, s8, any, fmt_order::any, spec::reference) - REG_SR(s8, any, u8, any, fmt_order::any, spec::reference) + REG_SR(s8, any, f32, any, fmt_order_any, spec_reference) + REG_SR(s8, any, s32, any, fmt_order_any, spec_reference) + REG_SR(s8, any, bf16, any, fmt_order_any, spec_reference) + REG_SR(s8, any, f16, any, fmt_order_any, spec_reference) + REG_SR(s8, any, s8, any, fmt_order_any, spec_reference) + REG_SR(s8, any, u8, any, fmt_order_any, spec_reference) REG_SPARSE_SR_X64(s8, any, s8, any) diff --git a/src/cpu/reorder/cpu_reorder_regular_u4.cpp b/src/cpu/reorder/cpu_reorder_regular_u4.cpp index 60a85da4a30..3cb62066af7 100644 --- a/src/cpu/reorder/cpu_reorder_regular_u4.cpp +++ b/src/cpu/reorder/cpu_reorder_regular_u4.cpp @@ -28,8 +28,29 @@ const impl_list_map_t &regular_u4_impl_list_map() { REG_SR(f32, any, u4, any, fmt_order::any, spec::reference) nullptr, }}, + {{u4, data_type::undef, 0}, { + REG_SR(u4, any, u4, OI8i8o2i, fmt_order_keep) + REG_SR(u4, any, u4, OI8i16o2i, fmt_order_keep) + REG_SR(u4, any, u4, OI8i24o2i, fmt_order_keep) + REG_SR(u4, any, u4, OI8i32o2i, fmt_order_keep) + REG_SR(u4, any, u4, OI8i64o2i, fmt_order_keep) + REG_SR(u4, any, u4, OI16i16o2i, fmt_order_keep) + REG_SR(u4, any, u4, OI16i32o2i, fmt_order_keep) + REG_SR(u4, any, u4, OI16i48o2i, fmt_order_keep) + REG_SR(u4, any, u4, OI16i64o2i, fmt_order_keep) + REG_SR(u4, any, u4, OI16i16o4i, fmt_order_keep) + REG_SR(u4, any, u4, OI16i32o4i, fmt_order_keep) + REG_SR(u4, any, u4, OI16i48o4i, fmt_order_keep) + REG_SR(u4, any, u4, OI16i64o4i, fmt_order_keep) + REG_SR(u4, any, u8, any, fmt_order_keep, spec::reference) + REG_SR(u4, any, f32, any, fmt_order_keep, spec::reference) + REG_SR(u4, any, f32, any, fmt_order::any, spec::reference) +
REG_SR(u4, any, bf16, any, fmt_order::any, spec::reference) + REG_SR(u4, any, f16, any, fmt_order::any, spec::reference) + nullptr, + }}, {{u4, f32, 0}, { - DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::brgemm_matmul_matrix_B_reorder_t)) + DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64_brgemm_matmul_copy_reorder_t)) REG_SR(u4, any, f32, any, fmt_order::any, spec::reference) nullptr, }}, diff --git a/src/cpu/reorder/cpu_reorder_regular_u8.cpp b/src/cpu/reorder/cpu_reorder_regular_u8.cpp index c96343e19b8..97c5c135420 100644 --- a/src/cpu/reorder/cpu_reorder_regular_u8.cpp +++ b/src/cpu/reorder/cpu_reorder_regular_u8.cpp @@ -27,7 +27,14 @@ const impl_list_map_t ®ular_u8_impl_list_map() { static const impl_list_map_t the_map = REG_REORDER_P({ // u8 -> {{u8, data_type::undef, 0}, { - DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::brgemm_matmul_matrix_B_reorder_t)) + DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64_brgemm_matmul_copy_reorder_t)) + + DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_uni_reorder_direct_copy_t)) + DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64_jit_blk_reorder_t)) + DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64_jit_uni_reorder_t)) + + DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64_jit_blk_reorder_t)) + DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64_jit_uni_reorder_t)) REG_FAST_DIRECT_COPY(u8, f32) REG_FAST_DIRECT_COPY(u8, s32) @@ -35,23 +42,17 @@ const impl_list_map_t ®ular_u8_impl_list_map() { REG_FAST_DIRECT_COPY(u8, s8) REG_FAST_DIRECT_COPY(u8, u8) - DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_blk_reorder_t)) - DNNL_X64_ONLY(CPU_REORDER_INSTANCE(x64::jit_uni_reorder_t)) - - DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::jit_blk_reorder_t)) - DNNL_AARCH64_ONLY(CPU_REORDER_INSTANCE(aarch64::jit_uni_reorder_t)) - DNNL_NON_X64_ONLY(REG_SR_BIDIR(u8, any, f32, nChw16c)) DNNL_NON_X64_ONLY(REG_SR_BIDIR(u8, any, s32, nChw16c)) DNNL_NON_X64_ONLY(REG_SR_BIDIR(u8, any, bf16, nChw16c)) DNNL_NON_X64_ONLY(REG_SR_BIDIR(u8, any, s8, nChw16c)) DNNL_NON_X64_ONLY(REG_SR_BIDIR(u8, any, u8, nChw16c)) - REG_SR(u8, any, f32, any, fmt_order::any, spec::reference) - REG_SR(u8, any, s32, any, fmt_order::any, spec::reference) - REG_SR(u8, any, bf16, any, fmt_order::any, spec::reference) - REG_SR(u8, any, u8, any, fmt_order::any, spec::reference) - REG_SR(u8, any, s8, any, fmt_order::any, spec::reference) + REG_SR(u8, any, f32, any, fmt_order_any, spec_reference) + REG_SR(u8, any, s32, any, fmt_order_any, spec_reference) + REG_SR(u8, any, bf16, any, fmt_order_any, spec_reference) + REG_SR(u8, any, u8, any, fmt_order_any, spec_reference) + REG_SR(u8, any, s8, any, fmt_order_any, spec_reference) nullptr, }}, diff --git a/src/cpu/reorder/simple_reorder.hpp b/src/cpu/reorder/simple_reorder.hpp index 63aee106f49..115c4419db9 100644 --- a/src/cpu/reorder/simple_reorder.hpp +++ b/src/cpu/reorder/simple_reorder.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2016-2024 Intel Corporation +* Copyright 2016-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
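The simple_reorder.hpp hunks below rename the q10n quantization functors (qz_a1b0 -> qz_a1b0_t, qz -> qz_t, qz_b0 -> qz_b0_t) without changing what they compute. As a rough, illustrative sketch of the qz_b0-style contract only - scale, round, saturate, with no accumulation term - under assumed semantics (qz_b0_sketch and its fixed f32->s8 signature are hypothetical, not the library's templates):

#include <algorithm>
#include <cmath>
#include <cstdint>

// Hypothetical stand-in for a q10n::qz_b0_t<f32, s8>-like functor.
struct qz_b0_sketch {
    int8_t operator()(float in, float scale) const {
        // Scale, round to nearest (ties to even under the default FP
        // environment), then saturate into the s8 range.
        float v = std::nearbyint(in * scale);
        v = std::max(-128.f, std::min(127.f, v));
        return static_cast<int8_t>(v);
    }
};

A call such as qz_b0_sketch()(inp[plain_off], s[oc] * adj_scale * d[oc]) then mirrors the shape of the call sites in the hunks that follow.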
@@ -43,13 +43,13 @@ using bd = block_dim_t; using ib = inner_blk_t; template <data_type_t dt> -using data_t = typename prec_traits<dt>::type; +using data_t = typename prec_traits_t<dt>::type; template <data_type_t type_i, data_type_t type_o> -using _qz_a1b0 = q10n::qz_a1b0<data_t<type_i>, data_t<type_o>>; +using _qz_a1b0 = q10n::qz_a1b0_t<data_t<type_i>, data_t<type_o>>; template <data_type_t type_i, data_type_t type_o> -using _qz = q10n::qz<data_t<type_i>, data_t<type_o>>; +using _qz = q10n::qz_t<data_t<type_i>, data_t<type_o>>; namespace fmt_order { const bool keep = true; @@ -79,6 +79,9 @@ struct conv_req_comp {}; // {s8, u8: asymmetric quantization} const auto output_d = ctx.memory_mdw(DNNL_ARG_TO, pd->dst_md()); \ DEFINE_ARG_SCALES_BUFFER_ATTR(pd->attr(), src_scales, DNNL_ARG_FROM); \ DEFINE_ARG_SCALES_BUFFER_ATTR(pd->attr(), dst_scales_, DNNL_ARG_TO); \ + const auto src_scales_d \ + = ctx.memory_mdw(DNNL_ARG_ATTR_SCALES | DNNL_ARG_FROM); \ + MAYBE_UNUSED(src_scales_d); \ int src_scales_mask, dst_scales_mask; \ CHECK(get_scales_mask(pd->attr(), &src_scales_mask, &dst_scales_mask)); \ int scales_mask = std::max(src_scales_mask, dst_scales_mask); \ @@ -88,7 +91,12 @@ struct conv_req_comp {}; // {s8, u8: asymmetric quantization} const float *dst_scales = pd->precompute_scales( \ scratchpad, pd->attr(), D_mask, dst_scales_); \ MAYBE_UNUSED(dst_scales); \ - DEFINE_ZERO_POINT_VALUE_ATTR(pd->attr(), src_zp, DNNL_ARG_FROM); \ + DEFINE_ZERO_POINTS_BUFFER_ATTR(pd->attr(), src_zero_points, DNNL_ARG_FROM) \ + const auto src_zps_d \ + = ctx.memory_mdw(DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_FROM); \ + MAYBE_UNUSED(src_zps_d); \ + int src_zp = src_zero_points ? src_zero_points[0] : 0; \ + MAYBE_UNUSED(src_zp); \ DEFINE_ZERO_POINT_VALUE_ATTR(pd->attr(), dst_zp, DNNL_ARG_TO); \ const float alpha = src_scales[0] * dst_scales[0]; \ MAYBE_UNUSED(alpha); \ @@ -125,12 +133,12 @@ inline status_t get_scales_mask( return status::invalid_arguments; *src_mask = 0; - if (!s.get(DNNL_ARG_SRC).has_default_values()) - *src_mask = s.get(DNNL_ARG_SRC).mask_; + if (!s.has_default_values(DNNL_ARG_SRC)) + *src_mask = s.get_mask(DNNL_ARG_SRC); *dst_mask = 0; - if (!s.get(DNNL_ARG_DST).has_default_values()) - *dst_mask = s.get(DNNL_ARG_DST).mask_; + if (!s.has_default_values(DNNL_ARG_DST)) + *dst_mask = s.get_mask(DNNL_ARG_DST); // This is used in a check function. if (*src_mask > 0 && *dst_mask > 0 && *dst_mask != *src_mask) @@ -140,15 +148,68 @@ inline bool simple_attr_check(const primitive_attr_t *attr, bool many_scales_support, bool sum_support) { using smask_t = primitive_attr_t::skip_mask_t; - smask_t skip_mask = smask_t::scales_runtime; + smask_t skip_mask = smask_t::scales; if (sum_support) skip_mask = skip_mask | smask_t::post_ops; if (!attr->has_default_values(skip_mask)) return false; + for (int arg : {DNNL_ARG_SRC, DNNL_ARG_DST}) { + // Non-default data types for scales are not supported. + if (!attr->scales_.has_default_data_type(arg)) return false; + // Non-default groups are not supported. + if (!attr->scales_.get(arg).has_default_groups()) return false; + } if (many_scales_support) return true; int src_mask, dst_mask; if (get_scales_mask(attr, &src_mask, &dst_mask) != status::success) return false; return src_mask == 0 && dst_mask == 0; } + +// TODO: once the re-factoring for quantization happens, maintain for each entry an md +// in compliance with the corresponding argument for easier offset computation. +inline status_t get_quant_md(memory_desc_t &md, const int ndims, + const dims_t in_dims, const int quant_mask, const dim_t g0, + const dim_t g1, const data_type_t dt) { + dims_t quant_dims {}; + // TODO: incorporate groups into `utils::copy_dims_with_mask` to simplify + // the logic.
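+ // Illustrative example (values assumed, not taken from the patch): with ndims = 2, in_dims = {64, 512}, quant_mask = 3 (both dims kept by copy_dims_with_mask below) and groups g0 = 32, g1 = 1, the division below yields quant_dims = {64 / 32, 512 / 1} = {2, 512}, i.e. one quantization entry per 32x1 block of the input.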
+ utils::copy_dims_with_mask(quant_dims, in_dims, ndims, quant_mask, + /* fill_with_ones = */ true); + if (ndims >= 2) { + if (utils::one_of(0, g0, g1)) return status::runtime_error; + quant_dims[ndims - 1] /= g1; + quant_dims[ndims - 2] /= g0; + } + + CHECK(memory_desc_init_by_tag( + md, ndims, quant_dims, dt, get_abx_tag(ndims))); + return status::success; +} + +// Returns the offset of a quantization entry based on the logical offset dimensions +// of the corresponding input - `input_idx`, `quant_mask`, groups `g0` and +// `g1` when they are supported (otherwise, pass `1`), and `quant_md`. +// +// The offset always coincides with the logical index because quantization entries +// don't have a notion of physical formats. +inline dim_t get_quant_off(const dims_t &input_idx, const int ndims, + const int quant_mask, const dim_t g0, const dim_t g1, + const memory_desc_t &quant_md) { + dims_t quant_idx {}; + utils::array_copy(quant_idx, input_idx, ndims); + utils::apply_mask_on_dims(quant_idx, ndims, quant_mask); + // Note: an `idx` must be divided by the group value, as grouped quantization + // applies to consecutive points. + // Using quant dimensions in `l_dims_by_l_offset` would lead to wrapping + // around dimensions instead of applying them consecutively. + if (ndims >= 2) { + quant_idx[ndims - 1] /= g1; + quant_idx[ndims - 2] /= g0; + } + + const memory_desc_wrapper q_mdw(quant_md); + return q_mdw.off_v(quant_idx); +} + } // namespace /* specific reorders: implementation */ @@ -160,16 +221,16 @@ struct simple_reorder_impl::type> { - static bool is_applicable(const memory_desc_wrapper &input_d, + static status_t is_applicable(const memory_desc_wrapper &input_d, const memory_desc_wrapper &output_d, const primitive_attr_t *attr) { using namespace data_type; using namespace utils; - if (input_d.has_runtime_dims_or_strides()) return false; + VDISPATCH_REORDER_IC(!input_d.has_runtime_dims_or_strides(), + VERBOSE_RUNTIMEDIM_UNSUPPORTED); int src_scales_mask, dst_scales_mask; - auto status = get_scales_mask(attr, &src_scales_mask, &dst_scales_mask); - if (status != status::success) return false; + CHECK(get_scales_mask(attr, &src_scales_mask, &dst_scales_mask)); int scales_mask = std::max(src_scales_mask, dst_scales_mask); static constexpr bool w_groups = one_of( @@ -184,16 +245,32 @@ struct simple_reorder_impl - o = q10n::qz_b0<data_t<type_i>, data_t<type_o>>()( + o = q10n::qz_b0_t<data_t<type_i>, data_t<type_o>>()( i, s * adj_scale * d); if (req_comp) cp[g * OC + oc] -= (int32_t)o; if (has_asymmetric_comp) zp[g * OC + oc] -= (int32_t)o; @@ -320,23 +397,23 @@ struct simple_reorder_impl::type> { - static bool is_applicable(const memory_desc_wrapper &input_d, + static status_t is_applicable(const memory_desc_wrapper &input_d, const memory_desc_wrapper &output_d, const primitive_attr_t *attr) { using namespace format_tag; using namespace data_type; using namespace utils; - if (input_d.has_runtime_dims_or_strides()) return false; + VDISPATCH_REORDER_IC(!input_d.has_runtime_dims_or_strides(), + VERBOSE_RUNTIMEDIM_UNSUPPORTED); int src_scales_mask, dst_scales_mask; - auto status = get_scales_mask(attr, &src_scales_mask, &dst_scales_mask); - if (status != status::success) return false; + CHECK(get_scales_mask(attr, &src_scales_mask, &dst_scales_mask)); int scales_mask = std::max(src_scales_mask, dst_scales_mask); - const bool w_groups = !one_of(tag_o, OIw4i16o4i, OIw2i8o4i, OIw4o4i, - OIhw4i16o4i, OIhw2i8o4i, OIhw4o4i, OIdhw4i16o4i, OIdhw2i8o4i, - OIdhw4o4i, OI4i16o4i, OI4i32o4i, OI4i64o4i, OIw4i32o4i, - OIw4i64o4i, OIhw4i32o4i, OIhw4i64o4i, OIdhw4i32o4i, + static constexpr bool
w_groups = !one_of(tag_o, OIw4i16o4i, OIw2i8o4i, + OIw4o4i, OIhw4i16o4i, OIhw2i8o4i, OIhw4o4i, OIdhw4i16o4i, + OIdhw2i8o4i, OIdhw4o4i, OI4i16o4i, OI4i32o4i, OI4i64o4i, + OIw4i32o4i, OIw4i64o4i, OIhw4i32o4i, OIhw4i64o4i, OIdhw4i32o4i, OIdhw4i64o4i); const bool req_comp = output_d.extra().flags @@ -348,16 +425,31 @@ struct simple_reorder_impl::inner_blks, - ib::_4a4b, ib::_4b4c) + constexpr dim_t icblksize + = utils::one_of( + tag_traits_t::inner_blks, ib::_4a4b, ib::_4b4c) ? 4 - : utils::one_of(tag_traits::inner_blks, ib::_2c8b4c, + : utils::one_of(tag_traits_t::inner_blks, ib::_2c8b4c, ib::_2b8a4b) ? 8 : 16; constexpr dim_t ocblksize - = tag_traits::inner_blks == ib::_4b32a4b ? 32 - : tag_traits::inner_blks == ib::_4b64a4b ? 64 - : icblksize; + = tag_traits_t::inner_blks == ib::_4b32a4b ? 32 + : tag_traits_t::inner_blks == ib::_4b64a4b ? 64 + : icblksize; const auto &plain_d = order_keep ? input_d : output_d; const auto &dims = input_d.dims(); @@ -444,7 +537,7 @@ struct simple_reorder_impl::inner_blks> +#define index AB_or_BC_blk_off::inner_blks> for_(dim_t ic = 0; ic < ic_block; ++ic) for (dim_t oc = 0; oc < oc_block; ++oc) { const auto plain_off @@ -454,7 +547,7 @@ struct simple_reorder_impl, data_t>()( + = q10n::qz_b0_t, data_t>()( inp[plain_off], src_scale * adj_scale * dst_scale); if (req_comp) c[oc] -= (128 * (int32_t)(out[index(oc, ic)])); @@ -536,37 +629,45 @@ struct simple_reorder_impl::type> { - static bool is_applicable(const memory_desc_wrapper &input_d, + static status_t is_applicable(const memory_desc_wrapper &input_d, const memory_desc_wrapper &output_d, const primitive_attr_t *attr) { using namespace format_tag; using namespace data_type; using namespace utils; - if (input_d.has_runtime_dims_or_strides()) return false; + VDISPATCH_REORDER_IC(!input_d.has_runtime_dims_or_strides(), + VERBOSE_RUNTIMEDIM_UNSUPPORTED); - const bool w_groups = !one_of(tag_o, Owi16o, Owhi16o); + static constexpr bool w_groups = !one_of(tag_o, Owi16o, Owhi16o); // Current formats are only used in jit kernels that natively // support s8 instructions, hence, there is no need for signed // compensation. const bool req_comp = output_d.extra().flags & memory_extra_flags::compensation_conv_s8s8; - const bool req_asymmetric_comp = output_d.extra().flags & memory_extra_flags::compensation_conv_asymmetric_src; auto mask_ok = [&](bool check, int mask) { - const int c_mask = 0x1, - g_mask = 0x3; // mask for i/o-channel and ngroups - return IMPLICATION(check, mask == (w_groups ? g_mask : c_mask)); + return IMPLICATION(check, mask == (w_groups ? 
0x3 : 0x1)); }; - return simple_attr_check(attr, true, false) - && input_d.matches_tag(tag_i) && output_d.matches_tag(tag_o) - && mask_ok(req_asymmetric_comp, - output_d.extra().asymm_compensation_mask) - && one_of(input_d.data_type(), f32, s8, bf16) - && output_d.data_type() == s8 && !req_comp; + VDISPATCH_REORDER_IC(one_of(input_d.data_type(), f32, s8, bf16), + VERBOSE_UNSUPPORTED_DT); + VDISPATCH_REORDER_IC( + output_d.data_type() == s8, VERBOSE_UNSUPPORTED_DT); + VDISPATCH_REORDER_IC( + simple_attr_check(attr, true, false), VERBOSE_UNSUPPORTED_ATTR); + VDISPATCH_REORDER_IC(input_d.matches_tag(tag_i), + VERBOSE_UNSUPPORTED_TENSOR_LAYOUT, "src"); + VDISPATCH_REORDER_IC(output_d.matches_tag(tag_o), + VERBOSE_UNSUPPORTED_TENSOR_LAYOUT, "dst"); + VDISPATCH_REORDER_IC(!req_comp, "compensation is not supported"); + VDISPATCH_REORDER_IC(mask_ok(req_asymmetric_comp, + output_d.extra().asymm_compensation_mask), + "zero-points compensation configuration is not supported"); + + return status::success; } GET_SCRATCHPAD_SIZE_ZERO(); @@ -609,7 +710,7 @@ struct simple_reorder_impl, data_t>()( + out[oc] = q10n::qz_b0_t, data_t>()( inp[plain_off], s[oc] * adj_scale * d[oc]); if (has_asymmetric_comp) zp[oc] -= (int32_t)(out[oc]); } @@ -689,45 +790,55 @@ struct simple_reorder_impl::type> { - static bool is_applicable(const memory_desc_wrapper &input_d, + static status_t is_applicable(const memory_desc_wrapper &input_d, const memory_desc_wrapper &output_d, const primitive_attr_t *attr) { using namespace format_tag; using namespace data_type; using namespace utils; - if (input_d.has_runtime_dims_or_strides()) return false; + VDISPATCH_REORDER_IC(!input_d.has_runtime_dims_or_strides(), + VERBOSE_RUNTIMEDIM_UNSUPPORTED); int src_scales_mask, dst_scales_mask; - auto status = get_scales_mask(attr, &src_scales_mask, &dst_scales_mask); - if (status != status::success) return false; + CHECK(get_scales_mask(attr, &src_scales_mask, &dst_scales_mask)); int scales_mask = std::max(src_scales_mask, dst_scales_mask); - const bool w_groups = !one_of(tag_o, OwI16o4i, OIw16i16o4i, OhwI16o4i, - OIhw16i16o4i, OdhwI16o4i, OIdhw16i16o4i); + static constexpr bool w_groups = !one_of(tag_o, OwI16o4i, OIw16i16o4i, + OhwI16o4i, OIhw16i16o4i, OdhwI16o4i, OIdhw16i16o4i); // Current formats are only used in jit kernels that natively // support s8 instructions, hence, there is no need for signed // compensation. const bool req_comp = output_d.extra().flags & memory_extra_flags::compensation_conv_s8s8; - const bool req_asymmetric_comp = output_d.extra().flags & memory_extra_flags::compensation_conv_asymmetric_src; auto mask_ok = [&](bool check, int mask) { - const int c_mask = 0x1, - g_mask = 0x3; // mask for o-channel and ngroups - return IMPLICATION(check, mask == (w_groups ? g_mask : c_mask)); + return IMPLICATION(check, mask == (w_groups ? 
0x3 : 0x1)); }; - return simple_attr_check(attr, true, false) - && input_d.matches_tag(tag_i) && output_d.matches_tag(tag_o) - && mask_ok(req_asymmetric_comp, - output_d.extra().asymm_compensation_mask) - && one_of(input_d.data_type(), f32, s8, bf16) - && IMPLICATION(!w_groups, one_of(scales_mask, 0, 0x1)) - && IMPLICATION(w_groups, one_of(scales_mask, 0, 0x3)) - && output_d.data_type() == s8 && !req_comp; + VDISPATCH_REORDER_IC(one_of(input_d.data_type(), f32, s8, bf16), + VERBOSE_UNSUPPORTED_DT); + VDISPATCH_REORDER_IC( + output_d.data_type() == s8, VERBOSE_UNSUPPORTED_DT); + VDISPATCH_REORDER_IC( + simple_attr_check(attr, true, false), VERBOSE_UNSUPPORTED_ATTR); + VDISPATCH_REORDER_IC(input_d.matches_tag(tag_i), + VERBOSE_UNSUPPORTED_TENSOR_LAYOUT, "src"); + VDISPATCH_REORDER_IC(output_d.matches_tag(tag_o), + VERBOSE_UNSUPPORTED_TENSOR_LAYOUT, "dst"); + VDISPATCH_REORDER_IC(!req_comp, "compensation is not supported"); + VDISPATCH_REORDER_IC(mask_ok(req_asymmetric_comp, + output_d.extra().asymm_compensation_mask), + "zero-points compensation configuration is not supported"); + VDISPATCH_REORDER_IC( + IMPLICATION(!w_groups, one_of(scales_mask, 0, 0x1)), + VERBOSE_UNSUPPORTED_SCALES_CFG); + VDISPATCH_REORDER_IC(IMPLICATION(w_groups, one_of(scales_mask, 0, 0x3)), + VERBOSE_UNSUPPORTED_SCALES_CFG); + + return status::success; } GET_SCRATCHPAD_SIZE_ZERO(); @@ -746,11 +857,11 @@ struct simple_reorder_impl::inner_blks, ib::_16b16a4b, + = utils::one_of(tag_traits_t::inner_blks, ib::_16b16a4b, ib::_16c16b4c) ? 64 - : utils::one_of( - tag_traits::inner_blks, ib::_16a4b, ib::_16b4c) + : utils::one_of(tag_traits_t::inner_blks, ib::_16a4b, + ib::_16b4c) ? 4 : 1; assert(ic_blksize != 1); @@ -791,9 +902,9 @@ struct simple_reorder_impl::inner_blks>( + auto index = AB_or_BC_blk_off::inner_blks>( oc, ic); - out[index] = q10n::qz_b0, data_t>()( + out[index] = q10n::qz_b0_t, data_t>()( inp[plain_off], s[oc] * adj_scale * d[oc]); if (has_asymmetric_comp) zp[oc] -= (int32_t)(out[index]); @@ -857,13 +968,20 @@ struct simple_reorder_impl::type> { - static bool is_applicable(const memory_desc_wrapper &input_d, + static status_t is_applicable(const memory_desc_wrapper &input_d, const memory_desc_wrapper &output_d, const primitive_attr_t *attr) { using namespace format_tag; using namespace data_type; using namespace utils; - if (input_d.has_runtime_dims_or_strides()) return false; + VDISPATCH_REORDER_IC(!input_d.has_runtime_dims_or_strides(), + VERBOSE_RUNTIMEDIM_UNSUPPORTED); + + int src_scales_mask, dst_scales_mask; + CHECK(get_scales_mask(attr, &src_scales_mask, &dst_scales_mask)); + int scales_mask = std::max(src_scales_mask, dst_scales_mask); + const size_t D_mask + = array_product(input_d.dims(), math::ilog2q(scales_mask + 1)); const bool req_comp = output_d.extra().flags & memory_extra_flags::compensation_conv_s8s8; @@ -876,21 +994,26 @@ struct simple_reorder_impl a, d0 <-> b, d1 <-> c constexpr dim_t D0_blksize = 64; constexpr dim_t D1_blksize - = (utils::one_of(tag_traits::inner_blks, ib::_16a64b4a, + = (utils::one_of(tag_traits_t::inner_blks, ib::_16a64b4a, ib::_16b64c4b)) ? 64 - : (utils::one_of(tag_traits::inner_blks, ib::_16a48b4a, + : (utils::one_of(tag_traits_t::inner_blks, ib::_16a48b4a, ib::_16b48c4b)) ? 48 - : (utils::one_of(tag_traits::inner_blks, ib::_16a32b4a, + : (utils::one_of(tag_traits_t::inner_blks, ib::_16a32b4a, ib::_16b32c4b)) ? 32 - : (utils::one_of(tag_traits::inner_blks, ib::_16a16b4a, + : (utils::one_of(tag_traits_t::inner_blks, ib::_16a16b4a, ib::_16b16c4b)) ? 
16 : 1; @@ -952,10 +1075,11 @@ struct simple_reorder_impl::inner_blks>( + = AB_or_BC_blk_off::inner_blks>( d0, d1); - out[index] = q10n::qz_b0, data_t>()( - inp[plain_off], s[0] * adj_scale * d[0]); + out[index] + = q10n::qz_b0_t, data_t>()( + inp[plain_off], s[0] * adj_scale * d[0]); auto o = static_cast(out[index]); if (req_comp) cp[d1] -= (128 * o); @@ -963,18 +1087,19 @@ struct simple_reorder_impl::inner_blks>( + = AB_or_BC_blk_off::inner_blks>( d0, d1); - out[index] = q10n::qz_b0, data_t>()( - 0, s[0] * adj_scale * d[0]); + out[index] + = q10n::qz_b0_t, data_t>()( + 0, s[0] * adj_scale * d[0]); } } for_(int d0 = d0_block; d0 < D0_blksize; ++d0) for (int d1 = 0; d1 < D1_blksize; ++d1) { - auto index = AB_or_BC_blk_off::inner_blks>( + auto index = AB_or_BC_blk_off::inner_blks>( d0, d1); - out[index] = q10n::qz_b0, data_t>()( + out[index] = q10n::qz_b0_t, data_t>()( 0, s[0] * adj_scale * d[0]); } }; @@ -1041,16 +1166,16 @@ struct simple_reorder_impl::type> { - static bool is_applicable(const memory_desc_wrapper &input_d, + static status_t is_applicable(const memory_desc_wrapper &input_d, const memory_desc_wrapper &output_d, const primitive_attr_t *attr) { using namespace data_type; using namespace utils; - if (input_d.has_runtime_dims_or_strides()) return false; + VDISPATCH_REORDER_IC(!input_d.has_runtime_dims_or_strides(), + VERBOSE_RUNTIMEDIM_UNSUPPORTED); int src_scales_mask, dst_scales_mask; - auto status = get_scales_mask(attr, &src_scales_mask, &dst_scales_mask); - if (status != status::success) return false; + CHECK(get_scales_mask(attr, &src_scales_mask, &dst_scales_mask)); int scales_mask = std::max(src_scales_mask, dst_scales_mask); const dim_t g = input_d.dims()[0]; @@ -1064,22 +1189,36 @@ struct simple_reorder_impl, data_t>()( + out[g] = q10n::qz_b0_t, data_t>()( inp[i_off], src_scale * adj_scale * dst_scale); } }; @@ -1220,15 +1359,26 @@ struct simple_reorder_impl::type> { - static bool is_applicable(const memory_desc_wrapper &input_d, + static status_t is_applicable(const memory_desc_wrapper &input_d, const memory_desc_wrapper &output_d, const primitive_attr_t *attr) { using namespace data_type; - if (input_d.has_runtime_dims_or_strides()) return false; + VDISPATCH_REORDER_IC(!input_d.has_runtime_dims_or_strides(), + VERBOSE_RUNTIMEDIM_UNSUPPORTED); + + VDISPATCH_REORDER_IC(order_keep, "unsupported internal impl detail"); + VDISPATCH_REORDER_IC( + input_d.data_type() == f32, VERBOSE_UNSUPPORTED_DT); + VDISPATCH_REORDER_IC( + output_d.data_type() == bf16, VERBOSE_UNSUPPORTED_DT); + VDISPATCH_REORDER_IC( + attr->has_default_values(), VERBOSE_UNSUPPORTED_ATTR); + VDISPATCH_REORDER_IC(input_d.matches_tag(tag_i), + VERBOSE_UNSUPPORTED_TENSOR_LAYOUT, "src"); + VDISPATCH_REORDER_IC(output_d.matches_tag(tag_o), + VERBOSE_UNSUPPORTED_TENSOR_LAYOUT, "dst"); - return order_keep && input_d.matches_tag(tag_i) - && output_d.matches_tag(tag_o) && input_d.data_type() == f32 - && output_d.data_type() == bf16 && attr->has_default_values(); + return status::success; } static size_t get_scratchpad_size(const memory_desc_wrapper &input_d, @@ -1322,25 +1472,39 @@ struct simple_reorder_impl struct simple_reorder_impl::type> { - static bool is_applicable(const memory_desc_wrapper &input_d, + static status_t is_applicable(const memory_desc_wrapper &input_d, const memory_desc_wrapper &output_d, const primitive_attr_t *attr) { using namespace data_type; - if (input_d.has_runtime_dims_or_strides()) return false; + VDISPATCH_REORDER_IC(!input_d.has_runtime_dims_or_strides(), + 
VERBOSE_RUNTIMEDIM_UNSUPPORTED); + + VDISPATCH_REORDER_IC(order_keep, "unsupported internal impl detail"); + VDISPATCH_REORDER_IC( + input_d.data_type() == f32, VERBOSE_UNSUPPORTED_DT); + VDISPATCH_REORDER_IC( + output_d.data_type() == bf16, VERBOSE_UNSUPPORTED_DT); + VDISPATCH_REORDER_IC( + attr->has_default_values(), VERBOSE_UNSUPPORTED_ATTR); + VDISPATCH_REORDER_IC(input_d.matches_tag(tag_i), + VERBOSE_UNSUPPORTED_TENSOR_LAYOUT, "src"); + VDISPATCH_REORDER_IC(output_d.matches_tag(tag_o), + VERBOSE_UNSUPPORTED_TENSOR_LAYOUT, "dst"); - return input_d.matches_tag(tag_i) && output_d.matches_tag(tag_o) - && input_d.data_type() == f32 && output_d.data_type() == bf16 - && attr->has_default_values(); + return status::success; } static size_t get_scratchpad_size(const memory_desc_wrapper &input_d, const memory_desc_wrapper &output_d) { + constexpr int ndims = tag_traits_t::ndims; const size_t blksize = 16; - const size_t W = input_d.dims()[3]; + const size_t W = input_d.dims()[ndims - 1]; return sizeof(float) * blksize * W * dnnl_get_max_threads(); } @@ -1348,14 +1512,15 @@ struct simple_reorder_impl::ndims; const auto &flat_d = input_d; const auto &dims = input_d.dims(); const auto &pdims = output_d.padded_dims(); const dim_t C = dims[1]; - const dim_t H = dims[2]; - const dim_t W = dims[3]; + const dim_t H = ndims == 3 ? 1 : dims[ndims - 2]; + const dim_t W = dims[ndims - 1]; const dim_t wsp_size = W * blksize; float *wspace = scratchpad.template get( @@ -1368,7 +1533,7 @@ struct simple_reorder_impl::type> { - static bool is_applicable(const memory_desc_wrapper &input_d, + static status_t is_applicable(const memory_desc_wrapper &input_d, const memory_desc_wrapper &output_d, const primitive_attr_t *attr) { - return simple_fmt_check(order_keep, tag_i, tag_o, input_d, output_d) - && simple_attr_check(attr, false, true); + using namespace data_type; + + VDISPATCH_REORDER_IC(!input_d.has_runtime_dims_or_strides(), + VERBOSE_RUNTIMEDIM_UNSUPPORTED); + + VDISPATCH_REORDER_IC( + simple_fmt_check(order_keep, tag_i, tag_o, input_d, output_d), + "unsupported configuration"); + VDISPATCH_REORDER_IC( + simple_attr_check(attr, false, true), VERBOSE_UNSUPPORTED_ATTR); + + return status::success; } GET_SCRATCHPAD_SIZE_ZERO(); @@ -1423,7 +1598,7 @@ struct simple_reorder_impl::inner_blks == ib::_4b ? 4 : 8; + = tag_traits_t::inner_blks == ib::_4b ? 4 : 8; constexpr dim_t blksize_16 = 16; constexpr dim_t ic_mult = order_keep ? blksize_16 / blksize_i : 1; @@ -1519,24 +1694,28 @@ struct simple_reorder_impl struct simple_reorder_impl::block_dims == bd::_A - || tag_traits::block_dims == bd::_B) - && tag_traits::ndims >= 3 - && tag_traits::ndims <= 6>::type> { + && (tag_traits_t::block_dims == bd::_A + || tag_traits_t::block_dims == bd::_B) + && tag_traits_t::ndims >= 3 + && tag_traits_t::ndims <= 6>::type> { PLAIN_TO_BLOCKED_IS_APPLICABLE(); GET_SCRATCHPAD_SIZE_ZERO(); @@ -1549,8 +1728,8 @@ struct simple_reorder_impl::ndims; - const int blk_idx = tag_traits::block_dims == bd::_A ? 0 : 1; + const int ndims = tag_traits_t::ndims; + const int blk_idx = tag_traits_t::block_dims == bd::_A ? 
0 : 1; const dim_t H0 = dims[0]; const dim_t H1 = dims[1]; @@ -1565,7 +1744,7 @@ struct simple_reorder_impl::inner_blks) { + switch (tag_traits_t::inner_blks) { case ib::_4a: case ib::_4b: blksize = 4; break; case ib::_8a: @@ -1684,14 +1863,17 @@ struct simple_reorder_impl struct simple_reorder_impl::block_dims == bd::_AB - || tag_traits::block_dims == bd::_BC) - && IMPLICATION(tag_traits::block_dims == bd::_AB, - tag_traits::ndims >= 3 - && tag_traits::ndims <= 5) - && IMPLICATION(tag_traits::block_dims == bd::_BC, - tag_traits::ndims >= 4 - && tag_traits::ndims <= 6)>::type> { + && (tag_traits_t::block_dims == bd::_AB + || tag_traits_t::block_dims == bd::_BC) + && IMPLICATION(tag_traits_t::block_dims == bd::_AB, + tag_traits_t::ndims >= 3 + && tag_traits_t::ndims <= 5) + && IMPLICATION(tag_traits_t::block_dims == bd::_BC, + tag_traits_t::ndims >= 4 + && tag_traits_t::ndims <= 6) + && (type_i != dnnl_bin && type_o != dnnl_bin) + && (type_i != dnnl_nf4 && type_o != dnnl_nf4) + && (type_i != dnnl_f4_e2m1 && type_o != dnnl_f4_e2m1)>::type> { PLAIN_TO_BLOCKED_IS_APPLICABLE(); GET_SCRATCHPAD_SIZE_ZERO(); @@ -1704,9 +1886,10 @@ struct simple_reorder_impl::ndims; + constexpr int ndims = tag_traits_t::ndims; - static constexpr bool with_g = tag_traits::block_dims == bd::_BC; + static constexpr bool with_g + = tag_traits_t::block_dims == bd::_BC; const dim_t G = with_g ? dims[0] : 1; const dim_t H0 = dims[0 + with_g]; @@ -1723,7 +1906,7 @@ struct simple_reorder_impl::inner_blks) { + switch (tag_traits_t::inner_blks) { case ib::_4b4a: case ib::_4b4c: case ib::_4c4b: @@ -1777,7 +1960,7 @@ struct simple_reorder_impl *i, data_t *o, const int block_h0, const int block_h1) { -#define blk_off AB_or_BC_blk_off::inner_blks> +#define blk_off AB_or_BC_blk_off::inner_blks> if (alpha == 1.0 && beta == 0.0) { for (int h0 = 0; h0 < block_h0; ++h0) { for (int h1 = 0; h1 < block_h1; ++h1) { @@ -1872,6 +2055,322 @@ struct simple_reorder_impl +struct simple_reorder_impl::type> +{ + static status_t is_applicable(const memory_desc_wrapper &input_d, + const memory_desc_wrapper &output_d, const primitive_attr_t *attr) { + VDISPATCH_REORDER_IC( + simple_fmt_check(order_keep, tag_i, tag_o, input_d, output_d), + "unsupported configuration"); + VDISPATCH_REORDER_IC( + simple_attr_check(attr, false, false), VERBOSE_UNSUPPORTED_ATTR); + return status::success; + } + + GET_SCRATCHPAD_SIZE_ZERO(); + + static status_t execute(const cpu_reorder_pd_t *pd, const exec_ctx_t &ctx) { + DECLARE_COMMON_PARAMS(); + + const auto &dims = input_d.dims(); + const int C = dims[1]; + const int H = dims[2]; + const int W = dims[3]; + + int nbits = 8; + const int CB = utils::div_up(C, nbits); + + auto ker = [&](const data_t *i, data_t *o) { + for (int cb = 0; cb < CB; ++cb) { + uint8_t bin_val = 0x00; + for (int c = cb * nbits, shift = 0; c < std::min(C, (cb + 1) * nbits); c++, shift++) { + const ptrdiff_t flat_off = c * input_d.blocking_desc().strides[1]; + + auto bit = uint8_t((i[flat_off] > 0) ? 
0x01 : 0x00); + bin_val |= (bit << shift); + } + + o[cb] = bin_val; + } + }; + + parallel_nd(dims[0], H, W, + [&](int n, int h, int w) { + auto iidx = input_d.blk_off(n, 0, h, w); + auto oidx = output_d.blk_off(n, 0, h, w); + + auto i = &input[iidx]; + auto o = &output[oidx / nbits]; + ker(i, o); + }); + + return status::success; + } +}; + +template +struct simple_reorder_impl::type> +{ + PLAIN_TO_BLOCKED_IS_APPLICABLE(); + + GET_SCRATCHPAD_SIZE_ZERO(); + + static status_t execute(const cpu_reorder_pd_t *pd, const exec_ctx_t &ctx) { + DECLARE_COMMON_PARAMS(); + + static constexpr bool w_groups = false; + constexpr int blksize_o = tag_o == format_tag::OIhw8o32i ? 8 : 16; + constexpr int blksize_i = 32; + + const auto &dims = input_d.dims(); + const auto &pdims = order_keep + ? output_d.padded_dims() + : input_d.padded_dims(); + + const int G = w_groups ? dims[0] : 1; + const int OC = dims[w_groups + 0]; + const int NB_OC = pdims[w_groups + 0] / blksize_o; + const int IC = dims[w_groups + 1]; + const int NB_IC = pdims[w_groups + 1] / blksize_i; + const int H = dims[w_groups + 2]; + const int W = dims[w_groups + 3]; + + constexpr int i_mult_o = blksize_o; + constexpr int i_mult_i = blksize_i; + constexpr int nbits = 8; + + auto extract_bit = [](uint8_t val, uint8_t bit) -> uint8_t { + return (uint8_t) ((val >> bit) & 0x0001); + }; + + parallel_nd(G, NB_OC, NB_IC, H, W, + [&](int g, int nb_oc, int nb_ic, int h, int w) { + const int oc_block = nstl::min(blksize_o, OC - nb_oc * blksize_o); + const int ic_block = nstl::min(blksize_i, IC - nb_ic * blksize_i); + + for (int oc = 0; oc < oc_block; ++oc) { + for (int icb = 0; icb < utils::div_up(ic_block, nbits); ++icb) { + + uint8_t bin_val = 0x00; + for (int ic = icb*nbits, shift = 0; ic < std::min(IC, (icb + 1)*nbits); ic++, shift++) { + size_t iidx = (i_mult_o * nb_oc + oc) * input_d.blocking_desc().strides[0] + + (i_mult_i * nb_ic + ic) * input_d.blocking_desc().strides[1] + + h * input_d.blocking_desc().strides[2] + + w; + + uint8_t bit = extract_bit(input[iidx / nbits], (uint8_t)(iidx % nbits)); + bin_val |= (bit << shift); + } + + size_t oidx = output_d.blk_off(g, nb_oc, nb_ic, h, w) + oc * blksize_i + icb * nbits; + output[oidx / nbits] = bin_val; + + } + } + }); + + return status::success; + } +}; + +template +struct simple_reorder_impl::block_dims == bd::_AB && + utils::one_of(type_i, data_type::nf4, data_type::s4, data_type::u4, data_type::f4_e2m1) && + type_i == type_o>::type> +{ + static status_t is_applicable(const memory_desc_wrapper &input_d, + const memory_desc_wrapper &output_d, const primitive_attr_t *attr) { + if (!(!input_d.has_runtime_dims_or_strides() && + simple_attr_check(attr, false, true) && + (order_keep ? 
output_d.matches_tag(tag_o) && input_d.is_plain() + : input_d.matches_tag(tag_o) && output_d.is_plain()))) + return status::invalid_arguments; + + if (output_d.blocking_desc().inner_nblks != 3 || + !utils::one_of(output_d.blocking_desc().inner_blks[2], 2, 4) || + output_d.blocking_desc().inner_idxs[2] != 1) + return status::invalid_arguments; + + return status::success; + } + + GET_SCRATCHPAD_SIZE_ZERO(); + + static status_t execute(const cpu_reorder_pd_t *pd, const exec_ctx_t &ctx) { + DECLARE_COMMON_PARAMS(); + + int blksize_o = 1; + int blksize_i = 1; + + for (int i = 0; i < output_d.blocking_desc().inner_nblks; i++) { + if (output_d.blocking_desc().inner_idxs[i] == 0) + blksize_o *= output_d.blocking_desc().inner_blks[i]; + else + blksize_i *= output_d.blocking_desc().inner_blks[i]; + } + + const auto &dims = input_d.dims(); + const auto &pdims = order_keep + ? output_d.padded_dims() + : input_d.padded_dims(); + + const int OC = dims[0]; + const int NB_OC = pdims[0] / blksize_o; + const int IC = dims[1]; + const int NB_IC = pdims[1] / blksize_i; + + int i_mult_o = blksize_o; + int i_mult_i = blksize_i; + + auto extract_half_byte = [&](uint8_t val, bool high_half) -> uint8_t { + uint8_t shift = high_half ? 4 : 0; + + return (uint8_t) ((val >> shift) & 0x000F); + }; + + auto insert_half_byte = [](uint8_t dst, uint8_t val, bool high_half) -> uint8_t { + uint8_t shift = high_half ? 0 : 4; + return dst | (uint8_t) (val << shift); + }; + + if (output_d.blocking_desc().inner_blks[2] == 4) { + parallel_nd(NB_OC, NB_IC, + [&](int nb_oc, int nb_ic) { + const int oc_block = nstl::min(blksize_o, OC - nb_oc * blksize_o); + const int ic_block = nstl::min(blksize_i, IC - nb_ic * blksize_i); + + for (int icb = 0; icb < utils::div_up(ic_block, 8); ++icb) { + for (int oc = 0; oc < oc_block; ++oc) { + const int ic_int_block = nstl::min(8, ic_block - icb * 8); + for (int ic = 0; ic < ic_int_block; ++ic) { + size_t iidx = (i_mult_o * nb_oc + oc) * input_d.blocking_desc().strides[0] + + (i_mult_i * nb_ic + icb * 8 + ic) * input_d.blocking_desc().strides[1]; + size_t oidx = output_d.blk_off(nb_oc, nb_ic) + icb * blksize_o * 8 + oc * 8 + 2 * (ic % 4) + ic / 4; + const uint8_t* packed_val = reinterpret_cast(input); + auto src_val = extract_half_byte(packed_val[iidx / 2], (uint8_t)(iidx % 2)); + uint8_t* output_val = reinterpret_cast(output); + uint8_t dst_val = oidx % 2 == 0 ? 0 : output_val[oidx / 2]; + dst_val = insert_half_byte(dst_val, src_val, (uint8_t)(oidx % 2)); + output_val[oidx / 2] = dst_val; + } + } + } + }); + } else { + parallel_nd(NB_OC, NB_IC, + [&](int nb_oc, int nb_ic) { + const int oc_block = nstl::min(blksize_o, OC - nb_oc * blksize_o); + const int ic_block = nstl::min(blksize_i, IC - nb_ic * blksize_i); + + for (int icb = 0; icb < utils::div_up(ic_block, 2); ++icb) { + for (int oc = 0; oc < oc_block; ++oc) { + for (int ic = 0; ic < 2; ++ic) { + size_t iidx = (i_mult_o * nb_oc + oc) * input_d.blocking_desc().strides[0] + + (i_mult_i * nb_ic + icb *2 + ic) * input_d.blocking_desc().strides[1]; + size_t oidx = output_d.blk_off(nb_oc, nb_ic) + icb * blksize_o * 2 + oc * 2 + ic; + const uint8_t* packed_val = reinterpret_cast(input); + auto src_val = extract_half_byte(packed_val[iidx / 2], (uint8_t)(iidx % 2)); + uint8_t* output_val = reinterpret_cast(output); + uint8_t dst_val = ic == 1 ? 
output_val[oidx / 2] : 0; + dst_val = insert_half_byte(dst_val, src_val, (uint8_t)(oidx % 2)); + output_val[oidx / 2] = dst_val; + } + } + } + }); + } + + return status::success; + } +}; + +template +struct simple_reorder_impl::type> { + static status_t is_applicable(const memory_desc_wrapper &input_d, + const memory_desc_wrapper &output_d, const primitive_attr_t *attr) { + if (!input_d.has_runtime_dims_or_strides() + && input_d.is_dense() && output_d.is_dense() + && simple_attr_check(attr, false, true)) { + return status::success; + } + return status::invalid_arguments; + } + + GET_SCRATCHPAD_SIZE_ZERO(); + + static status_t execute(const cpu_reorder_pd_t *pd, const exec_ctx_t &ctx) { + DECLARE_COMMON_PARAMS(); + using namespace utils; + + input += input_d.blk_off(0); + output += output_d.blk_off(0); + + const dim_t work_amount = input_d.nelems(); + + auto extract_half_byte = [&](uint8_t val, bool high_half) -> uint8_t { + uint8_t shift = high_half ? 4 : 0; + + return (uint8_t)((val >> shift) & 0x000F); + }; + + parallel(0, [&](const int ithr, const int nthr) { + dim_t start {0}, end {0}; + balance211(work_amount, nthr, ithr, start, end); + if (utils::one_of(type_i, dnnl_s4, dnnl_u4)) { + PRAGMA_OMP_SIMD() + for (dim_t idx = start; idx < end; idx++) { + const auto i_off = input_d.off_l(idx); + const auto o_off = output_d.off_l(idx); + const int8_t src_val = extract_half_byte(input[i_off / 2], i_off % 2); + output[o_off] = _qz_a1b0()(src_val); + } + } else { + static const std::array lookup = {-1.0f, + -0.6961928009986877f, + -0.5250730514526367f, + -0.39491748809814453f, + -0.28444138169288635f, + -0.18477343022823334f, + -0.09105003625154495f, + 0.0f, + 0.07958029955625534f, + 0.16093020141124725f, + 0.24611230194568634f, + 0.33791524171829224f, + 0.44070982933044434f, + 0.5626170039176941f, + 0.7229568362236023f, + 1.0f}; + + PRAGMA_OMP_SIMD() + for (dim_t idx = start; idx < end; idx++) { + const auto i_off = input_d.off_l(idx); + const auto o_off = output_d.off_l(idx); + const uint8_t idx_val = extract_half_byte(input[i_off / 2], i_off % 2); + output[o_off] = lookup[idx_val]; + } + } + }); + + return status::success; + } +}; + /* generic and direct-copy reorders */ template @@ -1880,12 +2379,21 @@ struct simple_reorder_impl::type> { - static bool is_applicable(const memory_desc_wrapper &input_d, + static status_t is_applicable(const memory_desc_wrapper &input_d, const memory_desc_wrapper &output_d, const primitive_attr_t *attr) { - return !input_d.has_runtime_dims_or_strides() - && input_d.similar_to(output_d, true, false, 0) - && input_d.is_dense() && output_d.is_dense() - && simple_attr_check(attr, false, true); + VDISPATCH_REORDER_IC(!input_d.has_runtime_dims_or_strides(), + VERBOSE_RUNTIMEDIM_UNSUPPORTED); + + VDISPATCH_REORDER_IC( + simple_attr_check(attr, false, true), VERBOSE_UNSUPPORTED_ATTR); + VDISPATCH_REORDER_IC( + input_d.is_dense(), VERBOSE_UNSUPPORTED_TENSOR_LAYOUT, "src"); + VDISPATCH_REORDER_IC( + output_d.is_dense(), VERBOSE_UNSUPPORTED_TENSOR_LAYOUT, "dst"); + VDISPATCH_REORDER_IC(input_d.similar_to(output_d, true, false, 0), + VERBOSE_TENSOR_FORMAT_MISMATCH, "src", "dst"); + + return status::success; } GET_SCRATCHPAD_SIZE_ZERO(); @@ -1911,25 +2419,26 @@ struct simple_reorder_impl, data_t>()( - input[e]); + output[e] + = q10n::qz_a1b0_t, data_t>()( + input[e]); } } else if (alpha == 1.0) { PRAGMA_OMP_SIMD() for (size_t e = start; e < end; ++e) { - output[e] = q10n::qz_a1, data_t>()( + output[e] = q10n::qz_a1_t, data_t>()( input[e], output[e], beta); } } else if (beta 
== 0.0) { PRAGMA_OMP_SIMD() for (size_t e = start; e < end; ++e) { - output[e] = q10n::qz_b0, data_t>()( + output[e] = q10n::qz_b0_t, data_t>()( input[e], alpha); } } else { PRAGMA_OMP_SIMD() for (size_t e = start; e < end; ++e) { - output[e] = q10n::qz, data_t>()( + output[e] = q10n::qz_t, data_t>()( input[e], output[e], alpha, beta); } } @@ -1938,28 +2447,27 @@ struct simple_reorder_impl, + output[e] = q10n::qz_a1b0_t, data_t>()(input[e]); } } else if (alpha == 1.0) { PRAGMA_OMP_SIMD() for (size_t e = nelems - rem_elems; e < nelems; ++e) { - output[e] - = q10n::qz_a1, data_t>()( - input[e], output[e], beta); + output[e] = q10n::qz_a1_t, + data_t>()(input[e], output[e], beta); } } else if (beta == 0.0) { PRAGMA_OMP_SIMD() for (size_t e = nelems - rem_elems; e < nelems; ++e) { - output[e] - = q10n::qz_b0, data_t>()( - input[e], alpha); + output[e] = q10n::qz_b0_t, + data_t>()(input[e], alpha); } } else { PRAGMA_OMP_SIMD() for (size_t e = nelems - rem_elems; e < nelems; ++e) { - output[e] = q10n::qz, data_t>()( - input[e], output[e], alpha, beta); + output[e] + = q10n::qz_t, data_t>()( + input[e], output[e], alpha, beta); } } } @@ -1971,13 +2479,23 @@ struct simple_reorder_impl struct simple_reorder_impl::type> { - static bool is_applicable(const memory_desc_wrapper &input_d, + static status_t is_applicable(const memory_desc_wrapper &input_d, const memory_desc_wrapper &output_d, const primitive_attr_t *attr) { - return !input_d.has_runtime_dims_or_strides() && input_d.is_dense() - && output_d.is_dense() && simple_attr_check(attr, false, true); + VDISPATCH_REORDER_IC(!input_d.has_runtime_dims_or_strides(), + VERBOSE_RUNTIMEDIM_UNSUPPORTED); + + VDISPATCH_REORDER_IC( + simple_attr_check(attr, false, true), VERBOSE_UNSUPPORTED_ATTR); + VDISPATCH_REORDER_IC( + input_d.is_dense(), VERBOSE_UNSUPPORTED_TENSOR_LAYOUT, "src"); + VDISPATCH_REORDER_IC( + output_d.is_dense(), VERBOSE_UNSUPPORTED_TENSOR_LAYOUT, "dst"); + + return status::success; } static size_t get_scratchpad_size(const memory_desc_wrapper &input_d, @@ -2024,19 +2542,19 @@ struct simple_reorder_impl()(wspace[i0_off]); + + const auto i1_off + = need_transform ? idx + 1 : input_d.off_l(idx + 1); + auto val1 = _qz_a1b0()(wspace[i1_off]); + const auto o_off = need_transform ? idx : output_d.off_l(idx); - const auto shift = i % 2 ? int4_extract_t::high_half - : int4_extract_t::low_half; - auto src_val - = _qz_a1b0()(wspace[i_off]); - const uint8_t dst_val = i == 0 - ? 
0 - : reinterpret_cast(output)[o_off / 2]; - output[o_off / 2] = src_val.insert(dst_val, shift); + nibble2_t o_val(val0.raw_bits_, val1.raw_bits_); + reinterpret_cast(output)[o_off / 2] = o_val.get(); } }); @@ -2048,13 +2566,33 @@ template struct simple_reorder_impl::type> { - static bool is_applicable(const memory_desc_wrapper &input_d, + static status_t is_applicable(const memory_desc_wrapper &input_d, const memory_desc_wrapper &output_d, const primitive_attr_t *attr) { - return !input_d.has_runtime_dims_or_strides() && input_d.is_dense() - && output_d.is_dense() && simple_attr_check(attr, false, true); + VDISPATCH_REORDER_IC(!input_d.has_runtime_dims_or_strides(), + VERBOSE_RUNTIMEDIM_UNSUPPORTED); + + VDISPATCH_REORDER_IC( + input_d.nelems() % 2 == 0, "Unsupported dimensions"); + VDISPATCH_REORDER_IC( + input_d.is_dense(), VERBOSE_UNSUPPORTED_TENSOR_LAYOUT, "src"); + VDISPATCH_REORDER_IC( + output_d.is_dense(), VERBOSE_UNSUPPORTED_TENSOR_LAYOUT, "dst"); + + using smask_t = primitive_attr_t::skip_mask_t; + smask_t skip_mask = smask_t::scales_data_type | smask_t::scales_groups + | smask_t::zero_points_data_type | smask_t::zero_points_groups; + VDISPATCH_REORDER_IC( + attr->has_default_values(skip_mask), VERBOSE_UNSUPPORTED_ATTR); + VDISPATCH_REORDER_IC(attr->scales_.has_default_values(DNNL_ARG_DST), + VERBOSE_UNSUPPORTED_SCALES_CFG); + return status::success; } static size_t get_scratchpad_size(const memory_desc_wrapper &input_d, @@ -2069,53 +2607,112 @@ struct simple_reorder_impl *wspace = scratchpad.template get>( memory_tracking::names::key_reorder_space); - // When formats of the input and the output are not identical, the idea - // is to reorder the data from the input format to the output format - // but within the same data type, and after the format reorder apply - // the compression into int4 as on `abx` format. + // The implementation splits data conversion and format conversion in + // two passes for cases when it's not straightforward to perform both + // at once. The second pass applicability is determined by: + // * Transformation between incompatible formats is needed, especially + // when int4 source in not dense in the last dimension... const bool need_transform = input_d.strides()[input_d.ndims() - 1] != 1; - wspace = need_transform ? wspace : output; + // * Post-processing, including advanced dequantization parameters as + // groups. + const auto &scales = pd->attr()->scales_; + const bool has_src_scales = !scales.has_default_values(DNNL_ARG_SRC); + const auto &zps = pd->attr()->zero_points_; + const bool has_src_zps = !zps.has_default_values(DNNL_ARG_SRC); + + const bool need_second_pass + = need_transform || has_src_scales || has_src_zps; + wspace = need_second_pass ? wspace : output; // To avoid clashes between threads each byte (or 2 elements) // is handled by a single thread const dim_t work_amount = input_d.nelems() / 2; parallel(0, [&](const int ithr, const int nthr) { + auto u8_input = reinterpret_cast(input); dim_t start {0}, end {0}; balance211(work_amount, nthr, ithr, start, end); PRAGMA_OMP_SIMD() - for_(dim_t j = start; j < end; j++) - for (int i = 0; i < 2; ++i) { - const auto idx = 2 * j + i; - const auto i_off = need_transform ? idx : input_d.off_l(idx); - const auto o_off = need_transform ? idx : output_d.off_l(idx); - const auto shift = i % 2 ? 
int4_extract_t::high_half - : int4_extract_t::low_half; - auto src_val = data_t::extract( - reinterpret_cast(input)[i_off / 2], - shift); - reinterpret_cast *>(wspace)[o_off] - = static_cast(src_val); + for (dim_t j = start; j < end; j++) { + const auto idx = 2 * j; + const auto i_off = need_second_pass ? idx : input_d.off_l(idx); + const nibble2_t in_nibble(u8_input[i_off / 2]); + + for (int i = 0; i < 2; ++i) { + const auto o_off = need_second_pass + ? idx + i + : output_d.off_l(idx + i); + data_t src_val(in_nibble.get(i)); + reinterpret_cast *>(wspace)[o_off] + = static_cast(src_val); + } } }); - if (need_transform) { - const dim_t work_amount = output_d.nelems(); - parallel(0, [&](const int ithr, const int nthr) { - dim_t start {0}, end {0}; - balance211(work_amount, nthr, ithr, start, end); - PRAGMA_OMP_SIMD() - for (dim_t idx = start; idx < end; idx++) { - const auto i_off = input_d.off_l(idx); - const auto o_off = output_d.off_l(idx); - output[o_off] = wspace[i_off]; - } - }); + if (!need_second_pass) return status::success; + + const int ndims = input_d.ndims(); + // Applied to the pre-last dimension. + const auto src_scales_group0 = scales.get_group(DNNL_ARG_SRC, 0); + // Applied to the last dimension. + const auto src_scales_group1 = scales.get_group(DNNL_ARG_SRC, 1); + + memory_desc_t src_scales_md {}; + if (has_src_scales) { + get_quant_md(src_scales_md, ndims, input_d.dims(), src_scales_mask, + src_scales_group0, src_scales_group1, + src_scales_d.data_type()); } + int src_zps_mask = zps.get_mask(DNNL_ARG_SRC); + // Applied to the pre-last dimension. + const auto src_zps_group0 = zps.get_group(DNNL_ARG_SRC, 0); + // Applied to the last dimension. + const auto src_zps_group1 = zps.get_group(DNNL_ARG_SRC, 1); + memory_desc_t src_zps_md {}; + if (has_src_zps) { + get_quant_md(src_zps_md, ndims, input_d.dims(), src_zps_mask, + src_zps_group0, src_zps_group1, src_zps_d.data_type()); + } + + parallel_nd(input_d.nelems(), [&](dim_t idx) { + // Must be per thread; when shared, race condition happens. + dims_t input_idx {}; + float src_scale = 1.f; + if (has_src_scales || has_src_zps) { + utils::l_dims_by_l_offset( + input_idx, idx, input_d.dims(), ndims); + } + if (has_src_scales) { + const dim_t src_scales_off = get_quant_off(input_idx, ndims, + src_scales_mask, src_scales_group0, src_scales_group1, + src_scales_md); + // A single scale has already been pre-processed by the + // library-managed macros. + src_scale = src_scales_d.nelems() == 1 + ? src_scales[0] + : io::load_float_value(src_scales_d.data_type(), + src_scales, src_scales_off); + } + + int src_zp_val = 0; // Avoid clashing with the one defined for rest. 
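+ // Note on the grouped-quantization lookup below: with group sizes
+ // (group0, group1) over the last two dimensions, one scale/zero-point
+ // serves a group0 x group1 tile of elements, so the parameter offset
+ // divides the element coordinate by the group size along each grouped
+ // dimension. A minimal sketch of the idea (illustrative only;
+ // quant_off_2d is a hypothetical helper, the real code goes through
+ // get_quant_md() and get_quant_off()):
+ //     dim_t quant_off_2d(dim_t r, dim_t c, dim_t g0, dim_t g1,
+ //             dim_t row_stride) {
+ //         return (r / g0) * row_stride + (c / g1);
+ //     }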
+ if (has_src_zps) { + const dim_t src_zps_off + = get_quant_off(input_idx, ndims, src_zps_mask, + src_zps_group0, src_zps_group1, src_zps_md); + src_zp_val = io::load_float_value( + src_zps_d.data_type(), src_zero_points, src_zps_off); + } + + const auto i_off = input_d.off_l(idx); + const auto o_off = output_d.off_l(idx); + output[o_off] = src_scale * (wspace[i_off] - src_zp_val); + }); + return status::success; } }; @@ -2126,15 +2723,24 @@ struct simple_reorder_impl::type> { - static bool is_applicable(const memory_desc_wrapper &input_d, + static status_t is_applicable(const memory_desc_wrapper &input_d, const memory_desc_wrapper &output_d, const primitive_attr_t *attr) { + VDISPATCH_REORDER_IC(!input_d.has_runtime_dims_or_strides(), + VERBOSE_RUNTIMEDIM_UNSUPPORTED); + auto is_dense_no_0 = [](const memory_desc_wrapper &data_d) { return nelems_no_dim_0(data_d) == _size_no_dim_0(data_d); }; - return !input_d.has_runtime_dims_or_strides() - && input_d.similar_to(output_d, true, false, 1) - && is_dense_no_0(input_d) && is_dense_no_0(output_d) - && simple_attr_check(attr, false, true); + VDISPATCH_REORDER_IC( + simple_attr_check(attr, false, true), VERBOSE_UNSUPPORTED_ATTR); + VDISPATCH_REORDER_IC(is_dense_no_0(input_d), + VERBOSE_UNSUPPORTED_TENSOR_LAYOUT, "src"); + VDISPATCH_REORDER_IC(is_dense_no_0(output_d), + VERBOSE_UNSUPPORTED_TENSOR_LAYOUT, "dst"); + VDISPATCH_REORDER_IC(input_d.similar_to(output_d, true, false, 1), + VERBOSE_TENSOR_FORMAT_MISMATCH, "src", "dst"); + + return status::success; } GET_SCRATCHPAD_SIZE_ZERO(); @@ -2205,11 +2811,7 @@ struct simple_reorder_impl::type> { - static bool is_applicable(const memory_desc_wrapper &input_d, + static status_t is_applicable(const memory_desc_wrapper &input_d, const memory_desc_wrapper &output_d, const primitive_attr_t *attr) { /* supported smask: 0x0...011..10...0, * i.e. 
1 should be contiguous */ @@ -2241,17 +2845,31 @@ struct simple_reorder_impl 0 && smask & 0x1; smask >>= 1) ; - if (smask != 0) return false; + VDISPATCH_REORDER_IC(smask == 0, VERBOSE_UNSUPPORTED_SCALES_CFG); + } + + using smask_t = primitive_attr_t::skip_mask_t; + smask_t skip_mask = smask_t::scales_data_type | smask_t::scales_groups + | smask_t::zero_points_data_type | smask_t::zero_points_groups + | smask_t::post_ops; + VDISPATCH_REORDER_IC( + attr->has_default_values(skip_mask), VERBOSE_UNSUPPORTED_ATTR); + VDISPATCH_REORDER_IC(simple_po_check(attr), VERBOSE_UNSUPPORTED_POSTOP); + const auto &scales = attr->scales_; + const bool has_dst_scales = !scales.has_default_values(DNNL_ARG_DST); + if (has_dst_scales) { + VDISPATCH_REORDER_IC(scales.has_default_data_type(DNNL_ARG_DST) + && scales.has_default_groups(DNNL_ARG_DST), + VERBOSE_UNSUPPORTED_SCALES_CFG); } + VDISPATCH_REORDER_IC( + input_d.is_blocking_desc() && !input_d.is_additional_buffer(), + VERBOSE_UNSUPPORTED_TENSOR_LAYOUT, "src"); + VDISPATCH_REORDER_IC( + output_d.is_blocking_desc() && !output_d.is_additional_buffer(), + VERBOSE_UNSUPPORTED_TENSOR_LAYOUT, "dst"); - using skip_mask_t = dnnl_primitive_attr::skip_mask_t; - return input_d.is_blocking_desc() && output_d.is_blocking_desc() - && !output_d.is_additional_buffer() - && !input_d.is_additional_buffer() - && attr->has_default_values(skip_mask_t::scales_runtime - | skip_mask_t::zero_points_runtime - | skip_mask_t::post_ops) - && simple_po_check(attr); + return status::success; } GET_SCRATCHPAD_SIZE_ZERO(); @@ -2264,23 +2882,83 @@ struct simple_reorder_impl()(f); - }); + const int ndims = input_d.ndims(); + const auto &scales = pd->attr()->scales_; + const bool has_src_scales = !scales.has_default_values(DNNL_ARG_SRC); + // Applied to the pre-last dimension. + const auto src_scales_group0 = scales.get_group(DNNL_ARG_SRC, 0); + // Applied to the last dimension. + const auto src_scales_group1 = scales.get_group(DNNL_ARG_SRC, 1); + memory_desc_t src_scales_md {}; + if (has_src_scales) { + get_quant_md(src_scales_md, ndims, input_d.dims(), src_scales_mask, + src_scales_group0, src_scales_group1, + src_scales_d.data_type()); + } + const bool has_dst_scales = !scales.has_default_values(DNNL_ARG_DST); + memory_desc_t dst_scales_md {}; + if (has_dst_scales) { + get_quant_md(dst_scales_md, ndims, input_d.dims(), dst_scales_mask, + 1, 1, data_type::f32); + } + + const auto &zps = pd->attr()->zero_points_; + int src_zps_mask = zps.get_mask(DNNL_ARG_SRC); + const bool has_src_zps = !zps.has_default_values(DNNL_ARG_SRC); + // Applied to the pre-last dimension. + const auto src_zps_group0 = zps.get_group(DNNL_ARG_SRC, 0); + // Applied to the last dimension. + const auto src_zps_group1 = zps.get_group(DNNL_ARG_SRC, 1); + memory_desc_t src_zps_md {}; + if (has_src_zps) { + get_quant_md(src_zps_md, ndims, input_d.dims(), src_zps_mask, + src_zps_group0, src_zps_group1, src_zps_d.data_type()); + } + + parallel_nd(input_d.nelems(), [&](dim_t idx) { + // Must be per thread; when shared, race condition happens. + dims_t input_idx {}; + float src_scale = 1.f; + if (has_src_scales || has_dst_scales || has_src_zps) { + utils::l_dims_by_l_offset( + input_idx, idx, input_d.dims(), ndims); + } + if (has_src_scales) { + const dim_t src_scales_off = get_quant_off(input_idx, ndims, + src_scales_mask, src_scales_group0, src_scales_group1, + src_scales_md); + // A single scale has already been pre-processed by the + // library-managed macros. + src_scale = src_scales_d.nelems() == 1 + ? 
src_scales[0] + : io::load_float_value(src_scales_d.data_type(), + src_scales, src_scales_off); + } + + float dst_scale = 1.f; + if (has_dst_scales) { + const dim_t dst_scales_off = get_quant_off( + input_idx, ndims, dst_scales_mask, 1, 1, dst_scales_md); + dst_scale = dst_scales[dst_scales_off]; + } + + int src_zp_val = 0; // Avoid clashing with the one defined for rest. + if (has_src_zps) { + const dim_t src_zps_off + = get_quant_off(input_idx, ndims, src_zps_mask, + src_zps_group0, src_zps_group1, src_zps_md); + src_zp_val = io::load_float_value( + src_zps_d.data_type(), src_zero_points, src_zps_off); + } + + const auto i_off = input_d.off_l(idx); + const auto o_off = output_d.off_l(idx); + float d = src_scale * (input[i_off] - src_zp_val); + if (beta) d += beta * output[o_off]; + d = d * dst_scale + dst_zp; + output[o_off] = _qz_a1b0()(d); + }); return status::success; } }; @@ -2299,24 +2977,36 @@ struct simple_reorder_t : public primitive_t { const primitive_attr_t *attr, engine_t *src_engine, const memory_desc_t *src_md, engine_t *dst_engine, const memory_desc_t *dst_md) { - using skip_mask_t = dnnl_primitive_attr::skip_mask_t; - bool args_ok = impl::is_dense_format_kind({src_md, dst_md}) - && src_md->data_type == type_i - && dst_md->data_type == type_o - && attr->has_default_values(skip_mask_t::scales_runtime - | skip_mask_t::zero_points - | skip_mask_t::zero_points_runtime - | skip_mask_t::post_ops) - && simple_reorder_impl::is_applicable(src_md, dst_md, attr); - if (!args_ok) return status::invalid_arguments; + // Since `type_i` and `type_o` are templated arguments, no need + // to put them under verbose_dispatch logic. + bool ok = src_md->data_type == type_i + && dst_md->data_type == type_o; + if (!ok) return status::invalid_arguments; + + VDISPATCH_REORDER_IC(impl::is_dense_format_kind({src_md, dst_md}), + VERBOSE_UNSUPPORTED_SPARSE_CFG); + + using skip_mask_t = primitive_attr_t::skip_mask_t; + VDISPATCH_REORDER_IC( + attr->has_default_values(skip_mask_t::scales_data_type + | skip_mask_t::scales_groups + | skip_mask_t::zero_points_data_type + | skip_mask_t::zero_points_groups + | skip_mask_t::post_ops), + VERBOSE_UNSUPPORTED_ATTR); + + auto status = simple_reorder_impl::is_applicable(src_md, dst_md, attr); + if (status != status::success) return status; - int mask = -1; - bool is_set = false; - CHECK(attr->scales_.get(DNNL_ARG_DST, &mask, &is_set)); const memory_desc_wrapper input_d(src_md); - if (input_d.has_runtime_dims_or_strides() && is_set && mask > 0) - return status::unimplemented; + + int mask = -1; + if (!attr->scales_.has_default_values(DNNL_ARG_DST)) { + mask = attr->scales_.get_mask(DNNL_ARG_DST); + if (input_d.has_runtime_dims_or_strides() && mask > 0) + return status::unimplemented; + } auto _pd = make_unique_pd(attr, src_engine->kind(), src_md, dst_engine->kind(), dst_md); @@ -2330,7 +3020,7 @@ struct simple_reorder_t : public primitive_t { scratchpad.book(memory_tracking::names::key_reorder_space, scratchpad_sz_, 1, 16); - if (is_set && mask > 0) { + if (mask > 0) { dim_t D_mask; _pd->get_D_values(input_d, mask, nullptr, &D_mask, nullptr); scratchpad.template book( diff --git a/src/cpu/reorder/simple_sparse_reorder.hpp b/src/cpu/reorder/simple_sparse_reorder.hpp index 2eb0cd4c203..b65dbefa97c 100644 --- a/src/cpu/reorder/simple_sparse_reorder.hpp +++ b/src/cpu/reorder/simple_sparse_reorder.hpp @@ -1,37 +1,38 @@ /******************************************************************************* -* Copyright 2023 Intel Corporation +* Copyright 2023-2025 Intel 
Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 +* * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. *******************************************************************************/ + #ifndef CPU_REORDER_SIMPLE_SPARSE_REORDER_HPP #define CPU_REORDER_SIMPLE_SPARSE_REORDER_HPP -#include -#include - #include + +#include "simple_reorder.hpp" + #include "common/c_types_map.hpp" #include "common/dnnl_thread.hpp" #include "common/math_utils.hpp" #include "common/primitive.hpp" -#include "common/reorder.hpp" - #include "common/primitive_attr.hpp" -#include "common/stream.hpp" #include "common/tag_traits.hpp" #include "common/type_helpers.hpp" #include "common/utils.hpp" + #include "cpu/cpu_primitive.hpp" #include "cpu/reorder/cpu_reorder_pd.hpp" + #include "cpu/simple_q10n.hpp" namespace dnnl { @@ -40,9 +41,6 @@ namespace cpu { // The following cases can be covered: // -// Note: `sparse_tag` is a regular format tag describing -// a regular tensor with sparse data. -// // - sparse_tag -> sparse_tag // - encoding -> encoding // @@ -55,10 +53,23 @@ namespace cpu { // - dense_tag -> encoding // - encoding -> dense_tag #define SIMPLE_SPARSE_REORDER_TEMPL_DECL \ - impl::data_type_t type_i, typename fmt_i_t, fmt_i_t fmt_i, \ - impl::data_type_t type_o, typename fmt_o_t, fmt_o_t fmt_o + impl::data_type_t type_i, format_tag_t fmt_i, \ + impl::data_type_t type_o, format_tag_t fmt_o, \ + bool order_keep + #define SIMPLE_SPARSE_REORDER_TEMPL_CALL \ - type_i, fmt_i_t, fmt_i, type_o, fmt_o_t, fmt_o + type_i, fmt_i, type_o, fmt_o, order_keep + +// TODO: move common code to reorder_utils.hpp. +namespace sparse_spec { +struct reference {}; +} // namespace sparse_spec + +namespace sparse_inputs_order { +constexpr bool keep = true; +constexpr bool reverse = false; +constexpr bool any = keep; +} // namespace sparse_inputs_order template struct simple_sparse_reorder_impl {}; @@ -66,113 +77,132 @@ struct simple_sparse_reorder_impl {}; namespace { template constexpr bool is_format_tag(T) { - return std::is_same::value; + return std::is_same::value ? true : false; } } // namespace +using namespace data_type; + +// TODO: think about combining compression reorders with sparse reorders. +/* specific reorders: IP compression */ template struct simple_sparse_reorder_impl::type> { + && (fmt_i == format_tag::oi + || fmt_i == format_tag::io)) + && (is_format_tag(fmt_o) + && fmt_o == format_tag::OI16i64o4i), + sparse_spec::reference>::type> { - static bool is_applicable(const memory_desc_wrapper &input_d, + static status_t is_applicable(const memory_desc_wrapper &input_d, const memory_desc_wrapper &output_d, const primitive_attr_t *attr) { - // This reorder expects a non-plain format for destination. 
- return input_d.is_blocking_desc() && output_d.is_sparse_desc() - && output_d.sparse_desc().encoding == sparse_encoding::packed - && output_d.blocking_desc().inner_nblks > 0 - && output_d.blk_size() % 64 == 0; - } - - static size_t get_scratchpad_size(const memory_desc_wrapper &input_d, - const memory_desc_wrapper &output_d) { - const auto nelems = output_d.nelems(true); - const auto tmp_output_sz = nelems * output_d.data_type_size(); - const auto nnz_per_blocks_sz - = nelems / output_d.blk_size() * sizeof(dim_t); - return tmp_output_sz + nnz_per_blocks_sz; - } - static status_t execute(const cpu_reorder_pd_t *pd, const exec_ctx_t &ctx, - const std::shared_ptr<primitive_t> &reorder) { - auto output_values = CTX_OUT_MEM(data_t<type_o> *, DNNL_ARG_TO, 0); - auto output_offsets = CTX_OUT_MEM(int64_t *, DNNL_ARG_TO, 1); - auto output_bitmask = CTX_OUT_MEM(uint64_t *, DNNL_ARG_TO, 2); + VDISPATCH_REORDER_IC(!input_d.has_runtime_dims_or_strides(), + VERBOSE_RUNTIMEDIM_UNSUPPORTED); + const size_t D_mask = utils::array_product( + input_d.dims(), math::ilog2q(attr->scales_.get_mask(DNNL_ARG_SRC) - INT_MIN + 1)); + const size_t oc = (input_d.dims()[0]); + VDISPATCH_REORDER_IC(input_d.matches_tag(fmt_i), + VERBOSE_UNSUPPORTED_TENSOR_LAYOUT, "src"); + VDISPATCH_REORDER_IC(output_d.matches_tag(fmt_o), + VERBOSE_UNSUPPORTED_TENSOR_LAYOUT, "dst"); + // VDISPATCH_REORDER_IC( + // input_d.is_blocking_desc(), VERBOSE_UNSUPPORTED_FORMAT_KIND); + VDISPATCH_REORDER_IC( + output_d.is_sparse_desc(), VERBOSE_UNSUPPORTED_FORMAT_KIND); + VDISPATCH_REORDER_IC( + output_d.sparse_desc().encoding == sparse_encoding::packed, + VERBOSE_UNSUPPORTED_FEATURE, + "only sparse_encoding::packed is supported for dst"); + // VDISPATCH_REORDER_IC(output_d.blocking_desc().inner_nblks > 0, + // VERBOSE_UNSUPPORTED_TENSOR_LAYOUT, "dst"); + // VDISPATCH_REORDER_IC(output_d.blk_size() % 64 == 0, + // VERBOSE_UNSUPPORTED_TENSOR_LAYOUT, "dst"); + VDISPATCH_REORDER_IC(utils::one_of(input_d.data_type(), data_type::f32, data_type::s8), + VERBOSE_UNSUPPORTED_DT, "src"); + VDISPATCH_REORDER_IC(utils::one_of(output_d.data_type(), data_type::s8) && (D_mask == 1 || D_mask == oc), + VERBOSE_UNSUPPORTED_DT, "dst"); - engine_t *engine = ctx.stream()->engine(); - const auto scratchpad = ctx.get_scratchpad_grantor(); - auto wspace_mem_storage = scratchpad.get_memory_storage( - memory_tracking::names::key_reorder_space); - memory_t wspace_mem( - engine, reorder->pd()->dst_md(), std::move(wspace_mem_storage)); - - exec_args_t r_args; - r_args[DNNL_ARG_SRC] = ctx.args().at(DNNL_ARG_FROM); - r_args[DNNL_ARG_DST] = {&wspace_mem, false}; - exec_ctx_t r_ctx(ctx, std::move(r_args)); + return status::success; + } - nested_scratchpad_t ns( - ctx, memory_tracking::names::key_nested, reorder); - r_ctx.set_scratchpad_grantor(ns.grantor()); - reorder->execute(r_ctx); + GET_SCRATCHPAD_SIZE_ZERO(); - auto *wspace = scratchpad.template get<data_t<type_o>>( - memory_tracking::names::key_reorder_space); + static status_t execute(const cpu_reorder_pd_t *pd, const exec_ctx_t &ctx) { + auto input = CTX_IN_MEM(const data_t<type_i> *, DNNL_ARG_FROM); + auto output = CTX_OUT_MEM(data_t<type_o> *, DNNL_ARG_TO); + const auto input_d = ctx.memory_mdw(DNNL_ARG_FROM, pd->src_md()); const auto output_d = ctx.memory_mdw(DNNL_ARG_TO, pd->dst_md()); - const auto nelems = output_d.nelems(true); - const auto blk_sz = output_d.blk_size(); - const auto nblks = nelems / blk_sz; - - dim_t *nnz_per_blocks - = reinterpret_cast<dim_t *>(reinterpret_cast<char *>(wspace) - + nelems * output_d.data_type_size()); - - static constexpr int bitmask_step = sizeof(uint64_t) * CHAR_BIT; - // Fill output_bitmask and move non-zero elements to
the beginning of the - blocks. Also, remember number of non-zero elements per-block to - calculate output_offsets later. - parallel_nd(nblks, [&](dim_t b) { - dim_t nnz_per_blk = 0; - for (dim_t i = 0; i < blk_sz / bitmask_step; i++) { - uint64_t &bm = output_bitmask[b * blk_sz / bitmask_step + i]; - bm = 0; - for (dim_t j = 0; j < bitmask_step; j++) { - const auto v = wspace[b * blk_sz + bitmask_step * i + j]; - if (v != 0) { - wspace[b * blk_sz + nnz_per_blk++] = v; - bm |= (uint64_t(1) << j); + + const auto &input_dims = input_d.dims(); + const auto &padded_dims = output_d.padded_dims(); + constexpr int i_outer_blksize = 16; + constexpr int i_blksize = i_outer_blksize * 4; + constexpr int o_blksize = 64; + + const int OC = input_dims[0]; + const int NB_OC = padded_dims[0] / o_blksize; + const int IC = input_dims[1]; + const int NB_IC = padded_dims[1] / i_blksize; + const int plain_o_stride = input_d.blocking_desc().strides[0]; + const int plain_i_stride = input_d.blocking_desc().strides[1]; + size_t offset = padded_dims[0] * padded_dims[1]; + + int total_blocks = offset / 4096; + using comp_tile_len_type = int; + comp_tile_len_type *comp_tile_len_ptr = reinterpret_cast<comp_tile_len_type *>(output); + int comp_tile_len_index = 0; + int cl_length = 0; + // Wasting memory space due to allocating a buffer for the whole tensor? + int output_offset = ceil((float)total_blocks * sizeof(comp_tile_len_type) / 64.0) * 64; + uint64_t *bitmask_ptr = reinterpret_cast<uint64_t *>(output + output_offset + offset); + auto outp = &output[output_d.blk_off(0, 0, 0, 0) + output_offset]; + + // TODO: add threading. + for (int O = 0; O < NB_OC; O++) { + for (int I = 0; I < NB_IC; I++) { + auto inp + = &input[input_d.blk_off(o_blksize * O, i_blksize * I)]; + const int oc_block = nstl::min(o_blksize, OC - O * o_blksize); + const int ic_block = nstl::min(i_blksize, IC - I * i_blksize); + int non_zeros = 0; + int bitmask_idx = (O * NB_IC + I) * i_blksize; + comp_tile_len_ptr[comp_tile_len_index] = cl_length; + + for (int ic_base = 0; ic_base < ic_block; + ic_base += 4) { // 64, steps of 4 + bitmask_ptr[bitmask_idx] = 0; + int bit = 0; + int count = 0; + for (int oc = 0; oc < oc_block; oc++) { // 64 + if (count % 64 == 0) { + bitmask_ptr[bitmask_idx] = 0; + bit = 0; + } + int plain_off = oc * plain_o_stride + + ic_base * plain_i_stride; + int ic_block_here = nstl::min(4, ic_block - ic_base); + for (int ic = 0; ic < ic_block_here; ic++) { // 4 + data_t<type_i> o = inp[plain_off]; + if (o != 0) { + *outp++ = o; + bitmask_ptr[bitmask_idx] |= (1UL << bit); + non_zeros++; + } + plain_off += plain_i_stride; + bit++; + count++; + } + if (count % 64 == 0) { bitmask_idx++; } } } + comp_tile_len_type cl = (comp_tile_len_type)ceil(non_zeros / 64.0); + comp_tile_len_index++; + cl_length = comp_tile_len_ptr[comp_tile_len_index - 1] + cl; + int unused_bytes_in_cl = 64 - (non_zeros % 64); + if (unused_bytes_in_cl == 64) { unused_bytes_in_cl = 0; } + outp += unused_bytes_in_cl; // 64: next output starts in new cacheline } - nnz_per_blocks[b] = nnz_per_blk; - }); - - // Calculate output_offsets using previously computed number of non-zero - // elements in each block. - parallel_nd(nblks, [&](dim_t b) { - dim_t off = 0; - if (b != 0) { - for (dim_t i = 0; i < b; i++) { - off += nnz_per_blocks[i]; - } - } - output_offsets[b] = off; - }); - - // Use the calculated output_offsets and number of non-zero elements - // per block to copy the non-zero elements that we moved to the - // beginning of the blocks to output_values.
- parallel_nd(nblks, [&](dim_t b) { - const auto nnz_per_blk = nnz_per_blocks[b]; - const auto blk_off = output_offsets[b]; - for (dim_t i = 0; i < nnz_per_blk; i++) { - output_values[blk_off + i] = wspace[b * blk_sz + i]; - } - }); - + } return status::success; } }; @@ -181,9 +211,8 @@ template struct simple_sparse_reorder_t : public primitive_t { struct pd_t : public cpu_reorder_pd_t { using cpu_reorder_pd_t::cpu_reorder_pd_t; - DECLARE_COMMON_PD_T("simple::any", simple_sparse_reorder_t); - std::shared_ptr reorder_pd_; + DECLARE_COMMON_PD_T("simple_sparse:any", simple_sparse_reorder_t); private: static status_t create(reorder_pd_t **reorder_pd, engine_t *engine, @@ -191,39 +220,24 @@ struct simple_sparse_reorder_t : public primitive_t { const memory_desc_t *src_md, engine_t *dst_engine, const memory_desc_t *dst_md) { - const bool args_ok = src_md->data_type == type_i + const bool ok = src_md->data_type == type_i && dst_md->data_type == type_o + && attr->has_default_values() && simple_sparse_reorder_impl< - SIMPLE_SPARSE_REORDER_TEMPL_CALL>:: - is_applicable(src_md, dst_md, attr); - if (!args_ok) return status::invalid_arguments; + SIMPLE_SPARSE_REORDER_TEMPL_CALL, + spec>::is_applicable(src_md, dst_md, attr) == status::success; + if (!ok) return status::invalid_arguments; - auto _pd = make_unique_pd(attr, src_engine->kind(), src_md, + auto _pd = new pd_t(attr, src_engine->kind(), src_md, dst_engine->kind(), dst_md); if (_pd == nullptr) return status::out_of_memory; - CHECK(_pd->init(engine, src_engine, dst_engine)); - - CHECK(_pd->init_scratchpad_md()); - return safe_ptr_assign(*reorder_pd, _pd.release()); - } + if (_pd->init(engine, src_engine, dst_engine) != status::success) { + delete _pd; + return status::unimplemented; + } - status_t init( - engine_t *engine, engine_t *src_engine, engine_t *dst_engine) { - // Convert sparse packed desc to blocking desc. 
- auto converted_dst_md = cvt_sparse_packed2blocked(*this->dst_md()); - - CHECK(reorder_primitive_desc_create( - reorder_pd_, engine, src_md(), &converted_dst_md, attr())); - - const size_t scratchpad_sz_ = simple_sparse_reorder_impl< - SIMPLE_SPARSE_REORDER_TEMPL_CALL>:: - get_scratchpad_size(src_md(), dst_md()); - auto scratchpad = scratchpad_registry().registrar(); - scratchpad.book(memory_tracking::names::key_reorder_space, - scratchpad_sz_, 1, 16); - scratchpad.book(memory_tracking::names::key_nested, - reorder_pd_->scratchpad_registry()); - return status::success; + _pd->init_scratchpad_md(); + return safe_ptr_assign(*reorder_pd, _pd); } friend dnnl::impl::impl_list_item_t; @@ -231,18 +245,13 @@ struct simple_sparse_reorder_t : public primitive_t { simple_sparse_reorder_t(const pd_t *apd) : primitive_t(apd) {} - status_t init(engine_t *engine) override { - return pd()->reorder_pd_->create_primitive(reorder_, engine); - } - status_t execute(const exec_ctx_t &ctx) const override { - return simple_sparse_reorder_impl< - SIMPLE_SPARSE_REORDER_TEMPL_CALL>::execute(pd(), ctx, reorder_); + return simple_sparse_reorder_impl::execute(pd(), ctx); } private: const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); } - std::shared_ptr reorder_; }; #undef SIMPLE_SPARSE_REORDER_TEMPL_DECL @@ -251,4 +260,5 @@ struct simple_sparse_reorder_t : public primitive_t { } // namespace cpu } // namespace impl } // namespace dnnl + #endif diff --git a/src/cpu/rnn/postgemm_dispatcher.hpp b/src/cpu/rnn/postgemm_dispatcher.hpp index 1c38e44d2fa..be43824f7aa 100644 --- a/src/cpu/rnn/postgemm_dispatcher.hpp +++ b/src/cpu/rnn/postgemm_dispatcher.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2024 Intel Corporation +* Copyright 2019-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
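The hunk that follows swaps the dispatcher's typedef declarations for using-aliases over prec_traits_t. The underlying mechanism is a compile-time map from a data_type_t enumerator to a concrete C type; a minimal self-contained sketch of that shape (hypothetical names, not the library's exact trait set):

#include <cstdint>

// Sketch: enum -> C type mapping in the style of prec_traits_t.
enum class dt { f32, s32, s8, u8 };
template <dt> struct prec_traits_sketch;
template <> struct prec_traits_sketch<dt::f32> { using type = float; };
template <> struct prec_traits_sketch<dt::s32> { using type = int32_t; };
template <> struct prec_traits_sketch<dt::s8> { using type = int8_t; };
template <> struct prec_traits_sketch<dt::u8> { using type = uint8_t; };

template <dt d>
using data_of = typename prec_traits_sketch<d>::type; // data_of<dt::s8> == int8_t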
@@ -52,14 +52,14 @@ template struct rnn_postgemm_dispatcher { - typedef typename prec_traits::type src_layer_t; - typedef typename prec_traits::type src_iter_t; - typedef typename prec_traits::type dst_layer_t; - typedef typename prec_traits::type dst_iter_t; - typedef typename prec_traits::type gemm_acc_t; - typedef typename prec_traits::type scratch_t; - typedef typename prec_traits::type ht_t; - typedef typename prec_traits::type gates_t; + using src_layer_t = typename prec_traits_t::type; + using src_iter_t = typename prec_traits_t::type; + using dst_layer_t = typename prec_traits_t::type; + using dst_iter_t = typename prec_traits_t::type; + using gemm_acc_t = typename prec_traits_t::type; + using scratch_t = typename prec_traits_t::type; + using ht_t = typename prec_traits_t::type; + using gates_t = typename prec_traits_t::type; using class_name = rnn_postgemm_dispatcher; @@ -253,20 +253,25 @@ struct rnn_postgemm_dispatcher { && !mayiuse(avx512_core)) return status::success; +//NOLINTBEGIN(bugprone-macro-parentheses) +// Can't put types into `()`: +// error: expected type-specifier before ‘)’ token #define CREATE_WITH_DIR(k, ker_t) \ do { \ if (mayiuse(avx512_core)) \ - k.reset(new ker_t(rnn, pd_)); \ + (k).reset( \ + new ker_t(rnn, pd_)); \ else if (mayiuse(avx2)) \ - k.reset(new ker_t(rnn, pd_)); \ + (k).reset(new ker_t(rnn, pd_)); \ else \ - k.reset(new ker_t(rnn, pd_)); \ + (k).reset(new ker_t(rnn, pd_)); \ } while (0) #define CREATE(k, ker_t) \ do { \ - if (jit_fwd) CREATE_WITH_DIR(k, CONCAT2(ker_t, _fwd)); \ - if (jit_bwd) CREATE_WITH_DIR(k, CONCAT2(ker_t, _bwd)); \ + if (jit_fwd) CREATE_WITH_DIR((k), CONCAT2(ker_t, _fwd)); \ + if (jit_bwd) CREATE_WITH_DIR((k), CONCAT2(ker_t, _bwd)); \ } while (0) + //NOLINTEND(bugprone-macro-parentheses) if (pd_->cell_kind() == alg_kind::vanilla_lstm) { CREATE(rnn_postgemm_, jit_uni_lstm_cell_postgemm); diff --git a/src/cpu/rnn/ref_postgemm_lstm.cpp b/src/cpu/rnn/ref_postgemm_lstm.cpp index 8fef036b710..d452eb39fc2 100644 --- a/src/cpu/rnn/ref_postgemm_lstm.cpp +++ b/src/cpu/rnn/ref_postgemm_lstm.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2018-2024 Intel Corporation +* Copyright 2018-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -188,7 +188,7 @@ rnn_postgemm_sig(rnn_postgemm_fwd_u8_t::lstm_postgemm) { const auto quantize_f32_u8 = [&](float f) { float qf = f * data_scale + data_shift; - return q10n::qz_a1b0()(qf); + return q10n::qz_a1b0_t()(qf); }; const auto dequantize_s32_f32 = [&](gemm_acc_t s, int gate, int j) { @@ -229,7 +229,7 @@ rnn_postgemm_sig(rnn_postgemm_fwd_s8_t::lstm_postgemm) { const auto quantize_f32_s8 = [&](float f) { float qf = f * data_scale + data_shift; - return q10n::qz_a1b0()(qf); + return q10n::qz_a1b0_t()(qf); }; const auto dequantize_s32_f32 = [&](gemm_acc_t s, int gate, int j) { diff --git a/src/cpu/rnn/ref_postgemm_lstm_projection.cpp b/src/cpu/rnn/ref_postgemm_lstm_projection.cpp index 153603ecafe..5a3c728d8db 100644 --- a/src/cpu/rnn/ref_postgemm_lstm_projection.cpp +++ b/src/cpu/rnn/ref_postgemm_lstm_projection.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2020-2024 Intel Corporation +* Copyright 2020-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
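The lstm_postgemm hunks above funnel every f32-to-integer conversion through the renamed qz_a1b0_t functor. Behaviorally, an alpha = 1, beta = 0 quantizer rounds to nearest and saturates to the destination range; a stand-alone sketch of the u8 case (an illustration of the semantics, not the library's exact implementation):

#include <algorithm>
#include <cmath>
#include <cstdint>

// Sketch: qz_a1b0-style f32 -> u8 conversion (alpha = 1, beta = 0).
static inline uint8_t quantize_f32_u8_sketch(float f) {
    float r = std::nearbyintf(f);          // round, ties-to-even by default
    r = std::min(std::max(r, 0.f), 255.f); // saturate to the u8 range
    return static_cast<uint8_t>(r);
}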
@@ -104,7 +104,7 @@ rnn_postgemm_sig(rnn_postgemm_fwd_u8_t::lstm_projection_postgemm) { float qf = f * data_scale + data_shift; qf = nstl::min(qf, 255.0f); qf = nstl::max(qf, 0.0f); - return q10n::qz_a1b0()(qf); + return q10n::qz_a1b0_t()(qf); }; const auto dequantize_s32_f32 = [&](gemm_acc_t s, int j) { @@ -149,7 +149,7 @@ rnn_postgemm_sig(rnn_postgemm_fwd_s8_t::lstm_projection_postgemm) { const auto quantize_f32_s8 = [&](float f) { const float qf = f * data_scale + data_shift; - return q10n::qz_a1b0()(qf); + return q10n::qz_a1b0_t()(qf); }; const auto dequantize_s32_f32 = [&](gemm_acc_t s, int j) { diff --git a/src/cpu/rnn/ref_rnn.cpp b/src/cpu/rnn/ref_rnn.cpp index 2df2652daea..21ccbd20d54 100644 --- a/src/cpu/rnn/ref_rnn.cpp +++ b/src/cpu/rnn/ref_rnn.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2018-2024 Intel Corporation +* Copyright 2018-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -320,6 +320,9 @@ _ref_rnn_common_t::pd_t::init_brgemm( bool allow_down_conversion_to_bf16 = is_f32 && is_fpmath_bf16 && is_impl_bf16; + // Initialized rnn_ early to get correct verbose output + rnn_ = zero(); + rnn_.is_brgemm = true; VDISPATCH_RNN( one_of(cell_kind, alg_kind::vanilla_rnn, alg_kind::vanilla_lstm, alg_kind::vanilla_gru, alg_kind::lbr_gru, @@ -352,8 +355,6 @@ _ref_rnn_common_t::pd_t::init_brgemm( VERBOSE_UNSUPPORTED_ATTR); VDISPATCH_RNN(this->with_bias(), VERBOSE_UNSUPPORTED_BIAS_CFG); - rnn_ = zero(); - rnn_.is_brgemm = true; VDISPATCH_RNN(init_conf(rnn_, *this->desc(), *this->attr(), this->src_md(0), this->src_md(1), this->src_md(2), this->weights_md(0), this->weights_md(1), @@ -413,7 +414,7 @@ _ref_rnn_common_t::pd_t::init_brgemm( VDISPATCH_RNN( !(rnn_.is_signed_int8_conf() && !is_superset(isa, avx512_core_amx)), VERBOSE_ISA_DT_MISMATCH); - VDISPATCH_RNN(!(rnn_.is_int8_conf() && !is_superset(isa, avx512_core_vnni)), + VDISPATCH_RNN(!(rnn_.is_int8_conf() && !is_superset(isa, avx2)), VERBOSE_ISA_DT_MISMATCH); VDISPATCH_RNN(!(rnn_.is_f32_conf() && !is_superset(isa, avx2)), VERBOSE_ISA_DT_MISMATCH); @@ -829,20 +830,33 @@ template ::execute_matmul)) { - engine_t *engine = ctx.stream()->engine(); + // Service engine is just a global classic CPU engine that is used + // when it's required to create memory_t objects for classic CPU + // engine regardless of the CPU runtime. For example, SYCL CPU engine + // cannot be used to create such objects. + engine_t *service_engine = get_service_engine(); constexpr auto mem_flag = memory_flags_t::use_runtime_ptr; - memory_t src_mem( - engine, matmul_prim->pd()->src_md(), mem_flag, (void *)(a_)); - memory_t wei_mem( - engine, matmul_prim->pd()->weights_md(), mem_flag, (void *)(b_)); - memory_t dst_mem( - engine, matmul_prim->pd()->dst_md(), mem_flag, (void *)(c_)); + + // a_, b_ and c_ are regular, raw CPU pointers that can only be used with + // memory_t objects created for the classic CPU engine. 
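+ // A condensed sketch of the wrapping pattern used below (same calls as
+ // the code that follows, simplified names):
+ //     std::unique_ptr<memory_t> mem;
+ //     CHECK(safe_ptr_assign(mem,
+ //             new memory_t(get_service_engine(), md,
+ //                     memory_flags_t::use_runtime_ptr, raw_ptr)));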
+ std::unique_ptr src_mem; + CHECK(safe_ptr_assign(src_mem, + new memory_t(service_engine, matmul_prim->pd()->src_md(), mem_flag, + (void *)(a_)))); + std::unique_ptr wei_mem; + CHECK(safe_ptr_assign(wei_mem, + new memory_t(service_engine, matmul_prim->pd()->weights_md(), + mem_flag, (void *)(b_)))); + std::unique_ptr dst_mem; + CHECK(safe_ptr_assign(dst_mem, + new memory_t(service_engine, matmul_prim->pd()->dst_md(), mem_flag, + (void *)(c_)))); exec_args_t matmul_args; // Note Matmul src and wei may not directly map to RNN primitive src and wei - matmul_args[DNNL_ARG_SRC] = {&wei_mem, true}; - matmul_args[DNNL_ARG_WEIGHTS] = {&src_mem, true}; - matmul_args[DNNL_ARG_DST] = {&dst_mem, false}; + matmul_args[DNNL_ARG_SRC] = {wei_mem.get(), true}; + matmul_args[DNNL_ARG_WEIGHTS] = {src_mem.get(), true}; + matmul_args[DNNL_ARG_DST] = {dst_mem.get(), false}; exec_ctx_t matmul_ctx(ctx, std::move(matmul_args)); nested_scratchpad_t ns(ctx, key_nested_multiple, matmul_prim); @@ -1409,7 +1423,7 @@ void copy_init_iter_fwd_template(const rnn_conf_t &rnn, const rnn_pd_t *pd, const auto maybe_q = [&](input_data_t f) { if (quantize) { float qf = f * data_scale + data_shift; - return q10n::qz_a1b0()(qf); + return q10n::qz_a1b0_t()(qf); } else return (src_data_t)f; }; @@ -1575,7 +1589,7 @@ void copy_res_layer_fwd_template(const rnn_conf_t &rnn, const rnn_pd_t *pd, PRAGMA_OMP_SIMD() for (int s = 0; s < rnn.dlc; s++) { float val = (float)ss[s] + dd[s]; - val = q10n::qz_a1b0()(val); + val = q10n::qz_a1b0_t()(val); dd[s] = (dst_layer_dt)((val - 2 * shift) / scale); } } else if (rnn_u8u8_case @@ -2132,11 +2146,13 @@ status_t _ref_rnn_common_t::execute( auto wei_iter_mem = scratchpad.get_memory_storage(key_rnn_bf32_wei_iter_trans); { - memory_t reorder_dst( - engine, &wei_layer_desc, std::move(wei_layer_mem)); + std::unique_ptr reorder_dst; + CHECK(safe_ptr_assign(reorder_dst, + new memory_t(engine, &wei_layer_desc, + std::move(wei_layer_mem)))); exec_args_t reorder_args; reorder_args[DNNL_ARG_SRC] = ctx.args().at(DNNL_ARG_WEIGHTS_LAYER); - reorder_args[DNNL_ARG_DST] = {&reorder_dst, false}; + reorder_args[DNNL_ARG_DST] = {reorder_dst.get(), false}; exec_ctx_t reorder_ctx(ctx, std::move(reorder_args)); nested_scratchpad_t ns( ctx, key_nested_multiple, bf32_wei_layer_reorder_); @@ -2148,11 +2164,13 @@ status_t _ref_rnn_common_t::execute( } { - memory_t reorder_dst( - engine, &wei_iter_desc, std::move(wei_iter_mem)); + std::unique_ptr reorder_dst; + CHECK(safe_ptr_assign(reorder_dst, + new memory_t( + engine, &wei_iter_desc, std::move(wei_iter_mem)))); exec_args_t reorder_args; reorder_args[DNNL_ARG_SRC] = ctx.args().at(DNNL_ARG_WEIGHTS_ITER); - reorder_args[DNNL_ARG_DST] = {&reorder_dst, false}; + reorder_args[DNNL_ARG_DST] = {reorder_dst.get(), false}; exec_ctx_t reorder_ctx(ctx, std::move(reorder_args)); nested_scratchpad_t ns( ctx, key_nested_multiple, bf32_wei_iter_reorder_); diff --git a/src/cpu/rnn/ref_rnn.hpp b/src/cpu/rnn/ref_rnn.hpp index a479867bd26..bb2262fa257 100644 --- a/src/cpu/rnn/ref_rnn.hpp +++ b/src/cpu/rnn/ref_rnn.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2018-2024 Intel Corporation +* Copyright 2018-2025 Intel Corporation * Copyright 2018-2024 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -62,7 +62,7 @@ void gates_reduction(const rnn_utils::rnn_conf_t &rnn, // @todo block k on simd-width to enable vectorization in // parallel_nd path #if DNNL_CPU_RUNTIME == DNNL_RUNTIME_OMP && 
_OPENMP >= 201307 \ - && (!defined(__INTEL_COMPILER) || __INTEL_COMPILER < 1910) + && defined __INTEL_COMPILER && __INTEL_COMPILER < 1910 #pragma omp parallel for simd collapse(2) for (int i = 0; i < rnn.n_gates; i++) for (int k = 0; k < rnn.dhc; k++) @@ -97,16 +97,16 @@ struct _ref_rnn_common_t : public primitive_t { rnn_postgemm_bwd_t>::type; /* These types are defined for each element in the cell execution */ - typedef typename prec_traits::type src_layer_t; - typedef typename prec_traits::type src_iter_t; - typedef typename prec_traits::type dst_layer_t; - typedef typename prec_traits::type dst_iter_t; - typedef typename prec_traits::type weights_t; - typedef typename prec_traits::type gemm_data_t; - typedef typename prec_traits::type gemm_acc_t; - typedef typename prec_traits::type scratch_t; - typedef typename prec_traits::type ht_t; - typedef typename prec_traits::type gates_t; + using src_layer_t = typename prec_traits_t::type; + using src_iter_t = typename prec_traits_t::type; + using dst_layer_t = typename prec_traits_t::type; + using dst_iter_t = typename prec_traits_t::type; + using weights_t = typename prec_traits_t::type; + using gemm_data_t = typename prec_traits_t::type; + using gemm_acc_t = typename prec_traits_t::type; + using scratch_t = typename prec_traits_t::type; + using ht_t = typename prec_traits_t::type; + using gates_t = typename prec_traits_t::type; using class_name = _ref_rnn_common_t; @@ -172,7 +172,7 @@ struct _ref_rnn_common_t : public primitive_t { : primitive_t(apd), rnn_postgemm_(nullptr) {} status_t init(engine_t *engine) override; - virtual ~_ref_rnn_common_t() { delete rnn_postgemm_; } + ~_ref_rnn_common_t() override { delete rnn_postgemm_; } status_t execute(const exec_ctx_t &ctx) const override; diff --git a/src/cpu/rnn/rnn_reorders.hpp b/src/cpu/rnn/rnn_reorders.hpp index 5156350d860..79b1ff21e93 100644 --- a/src/cpu/rnn/rnn_reorders.hpp +++ b/src/cpu/rnn/rnn_reorders.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2018-2024 Intel Corporation +* Copyright 2018-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -61,7 +61,7 @@ template static inline void quantize_igo(int8_t *scratch_quantized, const memory_desc_wrapper &src_d, const float *src, int mask, float *scales) { - typedef typename prec_traits::type in_data_t; + using in_data_t = typename prec_traits_t::type; // TODO: trivial strides assumes here. // Use proper strides where appropriate @@ -76,7 +76,7 @@ static inline void quantize_igo(int8_t *scratch_quantized, for (int go = 0; go < G * O; go++) { const float s = scales[(mask == 0) ? 0 : go]; scratch_quantized[ldi * G * O + go] - = q10n::qz_b0()( + = q10n::qz_b0_t()( src[ldi * G * O + go], s); } } @@ -87,7 +87,7 @@ template static inline void quantize_goi(int8_t *scratch_quantized, const memory_desc_wrapper &src_d, const float *src, int mask, float *scales) { - typedef typename prec_traits::type in_data_t; + using in_data_t = typename prec_traits_t::type; // TODO: trivial strides assumes here. 
// Use proper strides where appropriate @@ -100,7 +100,7 @@ static inline void quantize_goi(int8_t *scratch_quantized, PRAGMA_OMP_SIMD() for (dim_t i = 0; i < I; i++) { scratch_quantized[ld * I * G * O + i * G * O + go] - = q10n::qz_b0()( + = q10n::qz_b0_t()( src[ld * G * O * I + go * I + i], s); } }); @@ -232,8 +232,8 @@ struct rnn_data_reorder_t : public primitive_t { rnn_data_reorder_t(const pd_t *apd) : primitive_t(apd) {} private: - typedef typename prec_traits::type in_data_t; - typedef typename prec_traits::type out_data_t; + using in_data_t = typename prec_traits_t::type; + using out_data_t = typename prec_traits_t::type; bool is_dense() const { const memory_desc_wrapper &input_d = pd()->src_md(); @@ -271,7 +271,7 @@ struct rnn_data_reorder_t : public primitive_t { PRAGMA_OMP_SIMD() for (int j = 0; j < inner_dim; ++j) { const float in = (float)i_[j] * scale + shift; - o_[j] = q10n::qz_a1b0()(in); + o_[j] = q10n::qz_a1b0_t()(in); } } }); @@ -288,7 +288,8 @@ struct rnn_data_reorder_t : public primitive_t { const size_t nelems = input_d.nelems(); parallel_nd(nelems, [&](size_t i) { const float in = (float)input[input_d.off_l(i)] * scale + shift; - output[output_d.off_l(i)] = q10n::qz_a1b0()(in); + output[output_d.off_l(i)] + = q10n::qz_a1b0_t()(in); }); return status::success; } @@ -428,7 +429,7 @@ struct rnn_weights_reorder_s8_t : public primitive_t { rnn_weights_reorder_s8_t(const pd_t *apd) : primitive_t(apd) {} private: - typedef typename prec_traits::type in_data_t; + using in_data_t = typename prec_traits_t::type; status_t execute(const exec_ctx_t &ctx) const override { // TODO: trivial strides assumed here. @@ -615,8 +616,8 @@ struct rnn_weights_reorder_t : public primitive_t { rnn_weights_reorder_t(const pd_t *apd) : primitive_t(apd) {} private: - typedef typename prec_traits::type in_data_t; - typedef typename prec_traits::type out_data_t; + using in_data_t = typename prec_traits_t::type; + using out_data_t = typename prec_traits_t::type; status_t execute(const exec_ctx_t &ctx) const override { // TODO: trivial strides assumed here. @@ -779,12 +780,7 @@ struct rnn_brgemm_weights_reorder_s8_t : public primitive_t { return unimplemented; // Check the proper memory desc has been passed to u8s8 and s8s8 - // Note: currently rnn_u8s8_compensation and rnn_s8s8_compensation - // have common bit so we have to perform additional checks to - // separate these two cases const bool check_u8s8 = (od.extra().flags & rnn_u8s8_compensation) - && !types::extra_flag_rnn_s8s8_compensation_is_set( - od.extra().flags) && od.extra().compensation_mask == ((id.ndims() == 5) ? 
27 /* 11011 */ : 13 /* 1101 */); @@ -802,7 +798,8 @@ struct rnn_brgemm_weights_reorder_s8_t : public primitive_t { format_tag_t otag, itag; itag = id.matches_one_of_tag(ldigo, ldio); - otag = od.matches_one_of_tag(ldgOI64o4i, ldgOI32o4i, ldOI32o4i); + otag = od.matches_one_of_tag( + ldgOI64o4i, ldgOI32o4i, ldgOI16o4i, ldOI32o4i, ldOI16o4i); if (itag != format_tag::undef && otag != format_tag::undef) { _pd->itag_ = itag; _pd->otag_ = otag; @@ -842,8 +839,8 @@ struct rnn_brgemm_weights_reorder_s8_t : public primitive_t { rnn_brgemm_weights_reorder_s8_t(const pd_t *apd) : primitive_t(apd) {} private: - typedef typename prec_traits::type in_data_t; - typedef typename prec_traits::type out_data_t; + using in_data_t = typename prec_traits_t::type; + using out_data_t = typename prec_traits_t::type; status_t execute(const exec_ctx_t &ctx) const override { using namespace format_tag; @@ -860,15 +857,13 @@ struct rnn_brgemm_weights_reorder_s8_t : public primitive_t { return status::success; } - const auto &blocked_d = dst_d; - const auto &pdims = blocked_d.padded_dims(); - - const int o_block = pd()->otag_ == ldgOI64o4i ? 64 : 32; + const int o_block = dst_d.blocking_desc().inner_blks[0]; static constexpr int i_block = 4; dim_t L, D, I, G, O; init_dims(L, D, I, G, O, src_d); + const auto &pdims = dst_d.padded_dims(); const dim_t pI = pdims[2]; const dim_t pO = (src_d.ndims() == 5) ? pdims[4] : pdims[3]; const dim_t IB = pI / i_block; @@ -886,9 +881,7 @@ struct rnn_brgemm_weights_reorder_s8_t : public primitive_t { .template get(memory_tracking::names:: key_reorder_rnn_weights_reduction); float *comp = reinterpret_cast(dst + compensation_offset); - const bool req_s8s8_comp = (dst_d.extra().flags & rnn_u8s8_compensation) - && !types::extra_flag_rnn_s8s8_compensation_is_set( - dst_d.extra().flags); + const bool req_s8s8_comp = dst_d.extra().flags & rnn_u8s8_compensation; const auto mask_ok = [&](int mask) { return mask == ((src_d.ndims() == 5) ? 27 /* 11011 */ diff --git a/src/cpu/rnn/rnn_utils.cpp b/src/cpu/rnn/rnn_utils.cpp index e4342fe93eb..a4a51608ad1 100644 --- a/src/cpu/rnn/rnn_utils.cpp +++ b/src/cpu/rnn/rnn_utils.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2018-2023 Intel Corporation +* Copyright 2018-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
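An aside on the rnn_brgemm_weights_reorder_s8_t hunk above: deriving `o_block` from `dst_d.blocking_desc().inner_blks[0]` instead of switching on the format tag is what lets the new ldgOI16o4i/ldOI16o4i layouts pass through with no extra branches. A minimal standalone sketch of the idea (the two-field `blocking_desc_t` below is a simplified stand-in for the library's descriptor, not the real type):

#include <array>
#include <cassert>

// Simplified stand-in: only the inner-block fields matter here.
struct blocking_desc_t {
    int inner_nblks; // number of innermost blocks
    std::array<int, 12> inner_blks; // block sizes, outermost block first
};

// For ldgOI{64,32,16}o4i-style layouts the first inner block runs along
// O, so the O-block size falls out of the descriptor directly and no
// per-tag switch ("otag == ldgOI64o4i ? 64 : 32") is needed.
int o_block_of(const blocking_desc_t &bd) {
    assert(bd.inner_nblks >= 1);
    return bd.inner_blks[0];
}

int main() {
    const blocking_desc_t ldgOI16o4i {2, {16, 4}};
    const blocking_desc_t ldgOI64o4i {2, {64, 4}};
    assert(o_block_of(ldgOI16o4i) == 16 && o_block_of(ldgOI64o4i) == 64);
    return 0;
}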
@@ -76,8 +76,8 @@ bool rnn_utils::is_ldoi(const memory_desc_wrapper &mdw) { bool rnn_utils::is_ldigo_blocked(const memory_desc_wrapper &mdw) { format_tag_t md_format_tag = mdw.matches_one_of_tag(format_tag::ldgOi32o, format_tag::ldgOI32o2i, format_tag::ldgOI32o4i, - format_tag::ldgOI64o2i, format_tag::ldgOI64o4i, - format_tag::ldgOi16o); + format_tag::ldgOI16o4i, format_tag::ldgOI64o2i, + format_tag::ldgOI64o4i, format_tag::ldgOi16o); return md_format_tag != format_tag::undef; } @@ -88,8 +88,8 @@ bool rnn_utils::is_ldgoi_blocked(const memory_desc_wrapper &mdw) { } bool rnn_utils::is_ldio_blocked(const memory_desc_wrapper &mdw) { - format_tag_t md_format_tag = mdw.matches_one_of_tag( - format_tag::ldOi32o, format_tag::ldOI32o4i, format_tag::ldOi16o); + format_tag_t md_format_tag = mdw.matches_one_of_tag(format_tag::ldOi32o, + format_tag::ldOI32o4i, ldOI16o4i, format_tag::ldOi16o); return md_format_tag != format_tag::undef; } @@ -286,14 +286,16 @@ status_t rnn_utils::set_expected_desc(rnn_conf_t &rnn, if (weights_type == weights_type_t::projection) { if (rnn.is_int8_conf()) - tag = format_tag::ldOI32o4i; + tag = utils::map(n_block, format_tag::undef, 32, + format_tag::ldOI32o4i, 16, format_tag::ldOI16o4i); else tag = utils::map(n_block, format_tag::undef, 32, format_tag::ldOi32o, 16, format_tag::ldOi16o); } else if (rnn.is_fwd) { if (rnn.is_int8_conf()) tag = utils::map(n_block, format_tag::undef, 64, - format_tag::ldgOI64o4i, 32, ldgOI32o4i); + format_tag::ldgOI64o4i, 32, ldgOI32o4i, 16, + ldgOI16o4i); else if (rnn.is_xf16_conf()) tag = utils::map(n_block, format_tag::undef, 64, format_tag::ldgOI64o2i, 32, ldgOI32o2i); diff --git a/src/cpu/rnn/rnn_utils.hpp b/src/cpu/rnn/rnn_utils.hpp index 0bd61ba9365..f120e733cd3 100644 --- a/src/cpu/rnn/rnn_utils.hpp +++ b/src/cpu/rnn/rnn_utils.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2018-2024 Intel Corporation +* Copyright 2018-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -316,7 +316,7 @@ struct rnn_conf_t { size_t weights_iter_comp_offset = 0, weights_iter_pack_size = 0; size_t weights_projection_comp_offset = 0, weights_projection_pack_size = 0; - bool copy_bias = 0; + bool copy_bias = false; int weights_layer_ld = 0, weights_layer_nld = 0; int diff_weights_layer_ld = 0, diff_weights_layer_nld = 0; int weights_iter_ld = 0, weights_iter_nld = 0; @@ -347,9 +347,10 @@ struct rnn_conf_t { int dst_iter_c_ld_ = 0, dst_iter_c_nld_ = 0; int weights_iter_compensation_size = 0, weights_layer_compensation_size = 0; - bool is_fwd = 0, is_training = 0, is_lbr = 0, is_lstm_peephole = 0, - is_lstm_projection = 0, is_augru = 0, is_orig_gru = 0; - bool use_workspace = 0; + bool is_fwd = false, is_training = false, is_lbr = false, + is_lstm_peephole = false, is_lstm_projection = false, is_augru = false, + is_orig_gru = false; + bool use_workspace = false; // Size of workspace for each tensor in bytes // Notes: @@ -630,7 +631,7 @@ struct rnn_conf_t { int dhc_block_peephole, dhc_tail_peephole, dhc_blocks_peephole; bool brgemm_fwd_iter_layer_fuse_possible = false; - dim_t nthr; + int nthr; #if DNNL_X64 x64::cpu_isa_t brgemm_isa; #endif @@ -683,7 +684,7 @@ bool init_conf(rnn_conf_t &rnn, const rnn_desc_t &rd, rnn.dst_iter_c_dt = dst_iter_c_d.is_zero() ? 
data_type::f32 : dst_iter_c_d.data_type(); - rnn.cell_dt = data_traits::data_type; + rnn.cell_dt = data_traits_t::data_type; switch (rd.direction) { case dnnl_unidirectional_left2right: rnn.exec_dir = l2r; break; case dnnl_unidirectional_right2left: rnn.exec_dir = r2l; break; diff --git a/src/cpu/rv64/CMakeLists.txt b/src/cpu/rv64/CMakeLists.txt index 6be072fd790..fe5f9bb37e0 100644 --- a/src/cpu/rv64/CMakeLists.txt +++ b/src/cpu/rv64/CMakeLists.txt @@ -32,7 +32,7 @@ if(NOT DNNL_RISCV_USE_RVV_INTRINSICS) endif() endif() -if(NOT DNNL_CPU_RUNTIME STREQUAL "SEQ") - message(FATAL_ERROR "Only sequential runtime is now supported for a RISC-V CPU") +if(NOT (DNNL_CPU_RUNTIME STREQUAL "SEQ" OR DNNL_CPU_RUNTIME STREQUAL "OMP")) + message(FATAL_ERROR "Only sequential and OpenMP runtimes are supported for a RISC-V CPU") endif() diff --git a/src/cpu/rv64/rvv_nchw_pooling.cpp b/src/cpu/rv64/rvv_nchw_pooling.cpp index 5ded8584251..e4a1b566c7f 100644 --- a/src/cpu/rv64/rvv_nchw_pooling.cpp +++ b/src/cpu/rv64/rvv_nchw_pooling.cpp @@ -57,9 +57,9 @@ void MaxPooling(const float *src, float *dst, const dim_t batch, int ow_offset = ow * strideW - padLeft; size_t size = std::min(ow_offset + kerW, inW) - std::max(ow_offset, 0); - size_t cycleLength = vsetvl_e32m8(size); - vfloat32m8_t vmax - = vle32_v_f32m8(&arr_flt_min[0], cycleLength); + size_t cycleLength = __riscv_vsetvl_e32m8(size); + vfloat32m8_t vmax = __riscv_vle32_v_f32m8( + &arr_flt_min[0], cycleLength); for (int id = std::max(od_offset, 0); id < std::min(od_offset + kerD, inD); id++) @@ -73,34 +73,35 @@ void MaxPooling(const float *src, float *dst, const dim_t batch, size_t iw = 0; for (; iw < size - cycleLength; iw += cycleLength) { - vfloat32m8_t vsrc = vle32_v_f32m8( + vfloat32m8_t vsrc = __riscv_vle32_v_f32m8( &local_src[local_src_offset + iw], cycleLength); - vmax = vfmax_vv_f32m8( + vmax = __riscv_vfmax_vv_f32m8( vsrc, vmax, cycleLength); } - size_t tailLength = vsetvl_e32m8(size - iw); + size_t tailLength + = __riscv_vsetvl_e32m8(size - iw); { - vfloat32m8_t vsrc = vle32_v_f32m8( + vfloat32m8_t vsrc = __riscv_vle32_v_f32m8( &local_src[local_src_offset + iw], tailLength); - vmax = vfmax_vv_f32m8( + vmax = __riscv_vfmax_vv_f32m8( vsrc, vmax, tailLength); } } vfloat32m1_t min_scalar; float min = -__FLT_MAX__; - min_scalar = vle32_v_f32m1(&min, 1); + min_scalar = __riscv_vle32_v_f32m1(&min, 1); - cycleLength = vsetvl_e32m8(size); + cycleLength = __riscv_vsetvl_e32m8(size); vfloat32m1_t vred_res; - vred_res = vfredmax_vs_f32m8_f32m1( - vred_res, vmax, min_scalar, cycleLength); + vred_res = __riscv_vfredmax_vs_f32m8_f32m1( + vmax, min_scalar, cycleLength); float red_res; - vse32_v_f32m1(&red_res, vred_res, 1); + __riscv_vse32_v_f32m1(&red_res, vred_res, 1); dst[dst_offset] = red_res; } } diff --git a/src/cpu/rv64/rvv_nchw_pooling.hpp b/src/cpu/rv64/rvv_nchw_pooling.hpp index 4fc0d134b47..86df99c6ec0 100644 --- a/src/cpu/rv64/rvv_nchw_pooling.hpp +++ b/src/cpu/rv64/rvv_nchw_pooling.hpp @@ -1,5 +1,5 @@ /****************************************************************************** -* Copyright 2024 Intel Corporation +* Copyright 2024-2025 Intel Corporation * Copyright 2023 KNS Group LLC (YADRO) * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -66,7 +66,7 @@ struct riscv_nchw_pooling_fwd_t : public primitive_t { riscv_nchw_pooling_fwd_t(const pd_t *apd); - using data_t = typename prec_traits::type; + using data_t = typename prec_traits_t::type; status_t execute(const exec_ctx_t &ctx) const override { return execute_forward(ctx); diff --git a/src/cpu/scale_utils.cpp b/src/cpu/scale_utils.cpp index c6d92a33e2f..ad4e502c473
100644 --- a/src/cpu/scale_utils.cpp +++ b/src/cpu/scale_utils.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2022-2024 Intel Corporation +* Copyright 2022-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -32,21 +32,18 @@ constexpr size_t scales_simd_w = 16; } void book_precomputed_scales(memory_tracking::registrar_t &scratchpad, - const arg_scales_t &attr_scales, size_t wei_scale_count, + const scales_t &attr_scales, size_t wei_scale_count, bool force_scales_book) { using namespace dnnl::impl::memory_tracking::names; - const bool with_src_scales - = !attr_scales.get(DNNL_ARG_SRC).has_default_values(); + const bool with_src_scales = !attr_scales.has_default_values(DNNL_ARG_SRC); const bool with_wei_scales - = !attr_scales.get(DNNL_ARG_WEIGHTS).has_default_values(); - const auto wei_scales_dt = attr_scales.get(DNNL_ARG_WEIGHTS).data_type_; - const auto wei_scale_groups_ndims - = attr_scales.get(DNNL_ARG_WEIGHTS).ndims_; + = !attr_scales.has_default_values(DNNL_ARG_WEIGHTS); + if ((with_src_scales && with_wei_scales) || force_scales_book - || (wei_scales_dt != data_type::f32 && with_wei_scales) - || (wei_scale_groups_ndims > 0 && with_wei_scales)) { - const int wei_mask = attr_scales.get(DNNL_ARG_WEIGHTS).mask_; + || !attr_scales.has_default_data_type(DNNL_ARG_WEIGHTS) + || !attr_scales.get(DNNL_ARG_WEIGHTS).has_default_groups()) { + const int wei_mask = attr_scales.get_mask(DNNL_ARG_WEIGHTS); const size_t precomputed_scales_size = wei_mask == 0 ? scales_simd_w : nstl::max( @@ -60,27 +57,26 @@ void book_precomputed_scales(memory_tracking::registrar_t &scratchpad, bool req_copy_scales( const primitive_attr_t *attr, const float scale_adjust_factor) { const auto &attr_scales = attr->scales_; - const bool with_src_scales - = !attr_scales.get(DNNL_ARG_SRC).has_default_values(); + const bool with_src_scales = !attr_scales.has_default_values(DNNL_ARG_SRC); const bool with_wei_scales - = !attr_scales.get(DNNL_ARG_WEIGHTS).has_default_values(); - const auto wei_scales_dt = attr_scales.get(DNNL_ARG_WEIGHTS).data_type_; - const auto wei_scale_groups_ndims - = attr_scales.get(DNNL_ARG_WEIGHTS).ndims_; + = !attr_scales.has_default_values(DNNL_ARG_WEIGHTS); return (with_src_scales && with_wei_scales) || scale_adjust_factor != 1.0f - || (wei_scales_dt != data_type::f32 && with_wei_scales) - || (wei_scale_groups_ndims > 0 && with_wei_scales); + || !attr_scales.has_default_data_type(DNNL_ARG_WEIGHTS) + || !attr_scales.get(DNNL_ARG_WEIGHTS).has_default_groups(); } const float *precompute_scales(const memory_tracking::grantor_t &scratchpad, const float *src_scales, const float *wei_scales, dim_t oc, const primitive_attr_t *attr, float scale_adjust_factor) { - // Note: per-ic-channel is no supported in default - const int wei_scale_mask = attr->scales_.get(DNNL_ARG_WEIGHTS).mask_; + // Note: per-ic-channel is not supported by default. + const int wei_scale_mask = attr->scales_.get_mask(DNNL_ARG_WEIGHTS); return precompute_scales(scratchpad, src_scales, wei_scales, 1, oc, false, - wei_scale_mask != 0, attr, scale_adjust_factor, false); + wei_scale_mask > 0, attr, scale_adjust_factor, false); } +// Note: `wei_scale_per_ic` and `wei_scale_per_oc` could be derived inside +// this function if all primitives shared the same definition of the `per_ic` +// and `per_oc` masks; matmul, notably, defines them differently from the rest.
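To make the mask caveat in the note above concrete before the function body follows: a scales mask sets one bit per logical dimension that carries its own scale, so the numeric value of a "per-oc" mask depends on where the output-channel dimension sits in a given primitive's weights tensor. A toy sketch under the matmul convention documented later in this patch (`scale_index` is our illustrative helper, not a library function):

#include <cassert>
#include <cstdint>

// mask == 0: one common scale. Bit d set: one scale per index along
// dimension d. For a 2D matmul the n dimension is last, so per-n
// scales use mask = 1 << (ndims - 1) = 1 << 1.
int64_t scale_index(int mask, int64_t m, int64_t n, int64_t N) {
    if (mask == 0) return 0; // common scale
    if (mask == (1 << 1)) return n; // per-n for ndims == 2
    if (mask == ((1 << 0) | (1 << 1))) return m * N + n; // per-element
    assert(!"unsupported mask");
    return -1;
}

int main() {
    assert(scale_index(0, 3, 5, 8) == 0);
    assert(scale_index(1 << 1, 3, 5, 8) == 5);
    // For a 3D (batched) matmul the same "per-n" intent would be
    // mask = 1 << 2 instead, which is exactly why per_ic/per_oc cannot
    // be derived uniformly across primitives.
    return 0;
}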
const float *precompute_scales(const memory_tracking::grantor_t &scratchpad, const float *src_scales, const float *wei_scales, dim_t IC, dim_t OC, const bool wei_scale_per_ic, const bool wei_scale_per_oc, @@ -89,18 +85,16 @@ const float *precompute_scales(const memory_tracking::grantor_t &scratchpad, using namespace dnnl::impl::memory_tracking::names; const auto &attr_scales = attr->scales_; - const bool with_src_scales - = !attr_scales.get(DNNL_ARG_SRC).has_default_values(); + const bool with_src_scales = !attr_scales.has_default_values(DNNL_ARG_SRC); const auto wei_scale_count = (wei_scale_per_ic ? IC : 1) * (wei_scale_per_oc ? OC : 1); const float *scales = nullptr; if (req_copy_scales(attr, scale_adjust_factor)) { - const int wei_scale_mask = attr_scales.get(DNNL_ARG_WEIGHTS).mask_; size_t size = 0; auto loc_scales = scratchpad.template get(key_precomputed_scales, &size); - if (wei_scale_mask == 0 || wei_scale_count == 1) { + if (wei_scale_count == 1) { const size_t count = nstl::min(size / sizeof(float), scales_simd_w); utils::array_set(loc_scales, src_scales[0] * wei_scales[0] * scale_adjust_factor, count); @@ -108,12 +102,9 @@ const float *precompute_scales(const memory_tracking::grantor_t &scratchpad, const dim_t count = nstl::min( static_cast(size / sizeof(float)), wei_scale_count); const auto wei_scale_dt - = attr_scales.get(DNNL_ARG_WEIGHTS).data_type_; - const auto wei_scale_groups_ndims - = attr_scales.get(DNNL_ARG_WEIGHTS).ndims_; - const auto wei_scale_groups_ic = wei_scale_groups_ndims > 0 - ? attr_scales.get(DNNL_ARG_WEIGHTS).group_dims_[0] - : 1; + = attr_scales.get_data_type(DNNL_ARG_WEIGHTS); + const auto wei_scale_groups_ic + = attr_scales.get_group(DNNL_ARG_WEIGHTS, 0); // Note: per-ic-channel scales is only supported for // weights decompression for now if ((wei_scale_per_ic && wei_scale_groups_ic > 1) diff --git a/src/cpu/scale_utils.hpp b/src/cpu/scale_utils.hpp index 7c1ce535889..48164b776d4 100644 --- a/src/cpu/scale_utils.hpp +++ b/src/cpu/scale_utils.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2022-2024 Intel Corporation +* Copyright 2022-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -26,7 +26,7 @@ namespace impl { namespace cpu { void book_precomputed_scales(memory_tracking::registrar_t &scratchpad, - const arg_scales_t &attr_scales, size_t wei_scales_count, + const scales_t &attr_scales, size_t wei_scales_count, bool force_scales_book = false); bool req_copy_scales( diff --git a/src/cpu/simple_concat.cpp b/src/cpu/simple_concat.cpp index 234f6bf6d7f..f4a49d264f7 100644 --- a/src/cpu/simple_concat.cpp +++ b/src/cpu/simple_concat.cpp @@ -74,6 +74,16 @@ status_t simple_concat_t::execute(const exec_ctx_t &ctx) const { // Applies when concat axis is the outermost dimension, e.g. 
concat_axis = 0 // or concat_axis = 1, and dims[0] = 1; if (!has_outer_loop) { + // @todo CPU_PLUGIN: + // the following implementation was used to fix some performance issues. + // Now that upstream oneDNN has redesigned this piece, it no longer + // seems applicable. + // for (int a = 0; a < num_arrs; ++a) { + // const data_t *i = &iptrs[a][0]; + // data_t *o = &optrs[a][0]; + // parallel_nd_legacy(nelems_to_copy[a], [&](dim_t e) { o[e] = i[e]; }); + // } + int nthr = dnnl_get_max_threads(); parallel(nthr, [&](int ithr, int nthr) { for (int a = 0; a < num_arrs; ++a) { @@ -104,7 +114,7 @@ status_t simple_concat_t::execute(const exec_ctx_t &ctx) const { const auto L1_size = platform::get_per_core_cache_size(1); UNUSED(L1_size); // for Windows - parallel_nd(phys_dims[0], phys_dims[1], phys_dims[2], phys_dims[3], + parallel_nd_legacy(phys_dims[0], phys_dims[1], phys_dims[2], phys_dims[3], phys_dims[4], num_arrs, [&](dim_t n0, dim_t n1, dim_t n2, dim_t n3, dim_t n4, dim_t a) { // check if zero memory diff --git a/src/cpu/simple_concat.hpp b/src/cpu/simple_concat.hpp index ff0c5e22deb..ece8014e483 100644 --- a/src/cpu/simple_concat.hpp +++ b/src/cpu/simple_concat.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2017-2024 Intel Corporation +* Copyright 2017-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -168,7 +168,7 @@ struct simple_concat_t : public primitive_t { status_t execute(const exec_ctx_t &ctx) const override; - typedef typename prec_traits::type data_t; + using data_t = typename prec_traits_t::type; private: const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); } diff --git a/src/cpu/simple_layer_normalization.cpp b/src/cpu/simple_layer_normalization.cpp index e80f8cbbf48..493115fab57 100644 --- a/src/cpu/simple_layer_normalization.cpp +++ b/src/cpu/simple_layer_normalization.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2024 Intel Corporation +* Copyright 2019-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License.
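For context on the no-outer-loop concat path above: when the concat axis is outermost, every source occupies one contiguous span of the destination, so the copies can simply be chunked across threads. A self-contained sketch of that partitioning, with plain std::thread standing in for the library's parallel() runtime:

#include <algorithm>
#include <cstring>
#include <thread>
#include <vector>

// Each source array a occupies one contiguous span of dst (concat axis
// is outermost), so thread t copies the t-th chunk of every span.
void concat_outer(const std::vector<const float *> &srcs,
        const std::vector<size_t> &nelems, float *dst, int nthr) {
    std::vector<std::thread> pool;
    for (int t = 0; t < nthr; ++t)
        pool.emplace_back([&, t]() {
            size_t off = 0;
            for (size_t a = 0; a < srcs.size(); ++a) {
                const size_t chunk = (nelems[a] + nthr - 1) / nthr;
                const size_t beg
                        = std::min(static_cast<size_t>(t) * chunk, nelems[a]);
                const size_t end = std::min(beg + chunk, nelems[a]);
                if (end > beg)
                    std::memcpy(dst + off + beg, srcs[a] + beg,
                            (end - beg) * sizeof(float));
                off += nelems[a];
            }
        });
    for (auto &th : pool)
        th.join();
}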
@@ -52,8 +52,8 @@ status_t simple_layer_normalization_fwd_t::pd_t::init(engine_t *engine) { VDISPATCH_LNORM(stat_md()->data_type == f32, VERBOSE_UNSUPPORTED_DT); VDISPATCH_LNORM(check_scale_shift_data_type(), VERBOSE_UNSUPPORTED_FEATURE, "unsupported scale or shift data type"); - VDISPATCH_LNORM(attr()->has_default_values(skip_mask_t::scales_runtime - | skip_mask_t::post_ops), + VDISPATCH_LNORM(attr()->has_default_values( + skip_mask_t::scales | skip_mask_t::post_ops), VERBOSE_UNSUPPORTED_ATTR); VDISPATCH_LNORM(attr_scales_ok(), VERBOSE_UNSUPPORTED_SCALES_CFG); VDISPATCH_LNORM(post_ops_ok(), VERBOSE_UNSUPPORTED_POSTOP); diff --git a/src/cpu/simple_layer_normalization.hpp b/src/cpu/simple_layer_normalization.hpp index 95b01ba1788..aacf6b127e6 100644 --- a/src/cpu/simple_layer_normalization.hpp +++ b/src/cpu/simple_layer_normalization.hpp @@ -105,24 +105,29 @@ struct simple_layer_normalization_fwd_t : public primitive_t { auto scratchpad = ctx.get_scratchpad_grantor(); auto mean_mem = scratchpad.get_memory_storage(key_lnorm_tmp_mean); auto variance_mem = scratchpad.get_memory_storage(key_lnorm_tmp_var); - memory_t mean(engine, &(pd()->reordered_stat_md_), std::move(mean_mem)); - memory_t variance( - engine, &(pd()->reordered_stat_md_), std::move(variance_mem)); + std::unique_ptr mean; + CHECK(safe_ptr_assign(mean, + new memory_t(engine, &(pd()->reordered_stat_md_), + std::move(mean_mem)))); + std::unique_ptr variance; + CHECK(safe_ptr_assign(variance, + new memory_t(engine, &(pd()->reordered_stat_md_), + std::move(variance_mem)))); // reorder input stats if (pd()->stats_are_src() && reorder_) { - reorder_stat( - ctx, engine, ctx.args().at(DNNL_ARG_MEAN), {&mean, false}); + reorder_stat(ctx, engine, ctx.args().at(DNNL_ARG_MEAN), + {mean.get(), false}); reorder_stat(ctx, engine, ctx.args().at(DNNL_ARG_VARIANCE), - {&variance, false}); + {variance.get(), false}); } status_t status = execute_forward(ctx); if (status != status::success) return status; // reorder output stats if (!pd()->stats_are_src() && reorder_) { - reorder_stat( - ctx, engine, {&mean, true}, ctx.args().at(DNNL_ARG_MEAN)); - reorder_stat(ctx, engine, {&variance, true}, + reorder_stat(ctx, engine, {mean.get(), true}, + ctx.args().at(DNNL_ARG_MEAN)); + reorder_stat(ctx, engine, {variance.get(), true}, ctx.args().at(DNNL_ARG_VARIANCE)); } @@ -208,14 +213,18 @@ struct simple_layer_normalization_bwd_t : public primitive_t { auto mean_mem = scratchpad.get_memory_storage(key_lnorm_tmp_mean); auto variance_mem = scratchpad.get_memory_storage(key_lnorm_tmp_var); - memory_t mean( - engine, &(pd()->reordered_stat_md_), std::move(mean_mem)); - memory_t variance(engine, &(pd()->reordered_stat_md_), - std::move(variance_mem)); - reorder_stat( - ctx, engine, ctx.args().at(DNNL_ARG_MEAN), {&mean, false}); + std::unique_ptr mean; + CHECK(safe_ptr_assign(mean, + new memory_t(engine, &(pd()->reordered_stat_md_), + std::move(mean_mem)))); + std::unique_ptr variance; + CHECK(safe_ptr_assign(variance, + new memory_t(engine, &(pd()->reordered_stat_md_), + std::move(variance_mem)))); + reorder_stat(ctx, engine, ctx.args().at(DNNL_ARG_MEAN), + {mean.get(), false}); reorder_stat(ctx, engine, ctx.args().at(DNNL_ARG_VARIANCE), - {&variance, false}); + {variance.get(), false}); } return execute_backward(ctx); diff --git a/src/cpu/simple_q10n.hpp b/src/cpu/simple_q10n.hpp index 9b31cb120c4..10f2ca62a06 100644 --- a/src/cpu/simple_q10n.hpp +++ b/src/cpu/simple_q10n.hpp @@ -1,5 +1,5 @@ 
/******************************************************************************* -* Copyright 2017-2024 Intel Corporation +* Copyright 2017-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -45,7 +45,7 @@ saturate(const acc_t &x) { acc_t v = x; acc_t lbound = (acc_t)nstl::numeric_limits::lowest(); // Pick up a modified version of max value when do f32 -> s32. - acc_t ubound = types::max_value(data_traits::data_type); + acc_t ubound = types::max_value(data_traits_t::data_type); if (v < lbound) v = lbound; if (v > ubound) v = ubound; return v; @@ -82,33 +82,33 @@ inline out_t saturate_and_round(acc_t f) { /* Quantization with alpha == 1 and beta == 0 */ template -struct qz_a1b0 { +struct qz_a1b0_t { out_t operator()(in_t in) { return saturate_and_round((float)in); } }; template -struct qz_a1b0::value && !is_subset::value>::type> { out_t operator()(in_t in) { return saturate(in); } }; template -struct qz_a1b0::value>::type> { out_t operator()(in_t in) { return (out_t)in; } }; /* Quantization with alpha == 1 */ template -struct qz_a1 { +struct qz_a1_t { out_t operator()(in_t in, out_t out, float beta) { return saturate_and_round((float)in + beta * out); } }; template -struct qz_a1 { +struct qz_a1_t { float operator()(in_t in, float out, float beta) { return (float)in + beta * out; } @@ -116,55 +116,55 @@ struct qz_a1 { /* Quantization with beta == 0 */ template -struct qz_b0 { +struct qz_b0_t { out_t operator()(in_t in, float alpha) { return saturate_and_round(alpha * in); } }; template -struct qz_b0 { +struct qz_b0_t { float operator()(in_t in, float alpha) { return alpha * in; } }; /* Quantization */ template -struct qz { +struct qz_t { out_t operator()(in_t in, out_t out, float alpha, float beta) { return saturate_and_round(alpha * in + (beta ? beta * out : 0)); } }; template -struct qz { +struct qz_t { float operator()(in_t in, float out, float alpha, float beta) { return alpha * in + (beta ? beta * out : 0); } }; template <> -struct qz { +struct qz_t { float operator()(bfloat16_t in, bfloat16_t out, float alpha, float beta) { return (bfloat16_t)(alpha * (float)in + (beta ? beta * (float)out : 0)); } }; template <> -struct qz { +struct qz_t { float operator()(float in, bfloat16_t out, float alpha, float beta) { return (bfloat16_t)(alpha * in + (beta ? beta * out : 0)); } }; template <> -struct qz { +struct qz_t { float operator()(float16_t in, float16_t out, float alpha, float beta) { return (float16_t)(alpha * (float)in + (beta ? beta * (float)out : 0)); } }; template <> -struct qz { +struct qz_t { float operator()(float in, float16_t out, float alpha, float beta) { return (float16_t)(alpha * in + (beta ? beta * out : 0)); } diff --git a/src/cpu/simple_resampling.cpp b/src/cpu/simple_resampling.cpp index 0babdbe8265..7838c01ef9b 100644 --- a/src/cpu/simple_resampling.cpp +++ b/src/cpu/simple_resampling.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2024 Intel Corporation +* Copyright 2019-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
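For readers following the q10n renames above: `qz_a1b0_t` is the alpha == 1, beta == 0 case, i.e. round to nearest, then saturate to the output range. A standalone sketch of that path, assuming the default round-to-nearest-even FP environment (the library additionally special-cases the f32 -> s32 upper bound, which the plain cast below would get wrong for 32-bit outputs):

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <limits>

// Quantize with alpha == 1, beta == 0: round to nearest (even), then
// clamp to the output type's representable range.
template <typename out_t>
out_t qz_a1b0_sketch(float in) {
    float v = std::nearbyintf(in);
    v = std::max(v, (float)std::numeric_limits<out_t>::lowest());
    v = std::min(v, (float)std::numeric_limits<out_t>::max());
    return (out_t)v;
}

int main() {
    const bool ok = qz_a1b0_sketch<int8_t>(200.7f) == 127 // saturates up
            && qz_a1b0_sketch<uint8_t>(-3.2f) == 0; // saturates down
    return ok ? 0 : 1;
}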
@@ -40,8 +40,8 @@ template struct simple_resampling_kernel_t : public simple_resampling_base_t { simple_resampling_kernel_t(const resampling_pd_t *pd); - using src_data_t = typename prec_traits::type; - using dst_data_t = typename prec_traits::type; + using src_data_t = typename prec_traits_t::type; + using dst_data_t = typename prec_traits_t::type; status_t init() override; status_t execute(const exec_ctx_t &ctx) const override; @@ -179,25 +179,19 @@ void simple_resampling_kernel_t::fill_coeffs() { if (pd_->is_fwd()) { linear_coeffs_.reserve(pd_->OD() + pd_->OH() + pd_->OW()); for (dim_t od = 0; od < pd_->OD(); od++) - linear_coeffs_.emplace_back( - linear_coeffs_t(od, pd_->OD(), pd_->ID())); + linear_coeffs_.emplace_back(od, pd_->OD(), pd_->ID()); for (dim_t oh = 0; oh < pd_->OH(); oh++) - linear_coeffs_.emplace_back( - linear_coeffs_t(oh, pd_->OH(), pd_->IH())); + linear_coeffs_.emplace_back(oh, pd_->OH(), pd_->IH()); for (dim_t ow = 0; ow < pd_->OW(); ow++) - linear_coeffs_.emplace_back( - linear_coeffs_t(ow, pd_->OW(), pd_->IW())); + linear_coeffs_.emplace_back(ow, pd_->OW(), pd_->IW()); } else { bwd_linear_coeffs_.reserve(pd_->ID() + pd_->IH() + pd_->IW()); for (dim_t id = 0; id < pd_->ID(); id++) - bwd_linear_coeffs_.emplace_back( - bwd_linear_coeffs_t(id, pd_->OD(), pd_->ID())); + bwd_linear_coeffs_.emplace_back(id, pd_->OD(), pd_->ID()); for (dim_t ih = 0; ih < pd_->IH(); ih++) - bwd_linear_coeffs_.emplace_back( - bwd_linear_coeffs_t(ih, pd_->OH(), pd_->IH())); + bwd_linear_coeffs_.emplace_back(ih, pd_->OH(), pd_->IH()); for (dim_t iw = 0; iw < pd_->IW(); iw++) - bwd_linear_coeffs_.emplace_back( - bwd_linear_coeffs_t(iw, pd_->OW(), pd_->IW())); + bwd_linear_coeffs_.emplace_back(iw, pd_->OW(), pd_->IW()); } } diff --git a/src/cpu/simple_resampling.hpp b/src/cpu/simple_resampling.hpp index f632baa27a4..a9ccef95af2 100644 --- a/src/cpu/simple_resampling.hpp +++ b/src/cpu/simple_resampling.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2024 Intel Corporation +* Copyright 2019-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -103,7 +103,8 @@ struct simple_resampling_fwd_t : public primitive_t { simple_resampling_fwd_t(const pd_t *apd); status_t init(engine_t *engine) override; - ~simple_resampling_fwd_t() = default; + + ~simple_resampling_fwd_t() override = default; status_t execute(const exec_ctx_t &ctx) const override; @@ -149,7 +150,8 @@ struct simple_resampling_bwd_t : public primitive_t { simple_resampling_bwd_t(const pd_t *apd); status_t init(engine_t *engine) override; - ~simple_resampling_bwd_t() = default; + + ~simple_resampling_bwd_t() override = default; status_t execute(const exec_ctx_t &ctx) const override; diff --git a/src/cpu/simple_sum.hpp b/src/cpu/simple_sum.hpp index e8b72a21910..db2d4b1ace4 100644 --- a/src/cpu/simple_sum.hpp +++ b/src/cpu/simple_sum.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2017-2024 Intel Corporation +* Copyright 2017-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
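The resampling change above is a classic emplace_back cleanup: passing constructor arguments straight through builds the element in place, whereas wrapping them in an explicit temporary forces an extra move or copy. A minimal illustration with a stand-in coefficient type (`coeffs_t` is ours; the library's linear_coeffs_t carries more state):

#include <vector>

struct coeffs_t {
    long out_pos, out_dim, in_dim;
    coeffs_t(long op, long od, long id)
        : out_pos(op), out_dim(od), in_dim(id) {}
};

int main() {
    std::vector<coeffs_t> v;
    v.reserve(2);
    v.emplace_back(coeffs_t(0, 4, 2)); // builds a temporary, then moves
    v.emplace_back(1, 4, 2); // constructs directly inside the vector
    return 0;
}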
@@ -124,9 +124,9 @@ struct simple_sum_t : public primitive_t { status_t execute(const exec_ctx_t &ctx) const override; enum { max_num_arrs = 16 }; - typedef typename prec_traits::type src_data_t; - typedef typename prec_traits::type dst_data_t; - typedef typename prec_traits::type acc_data_t; + using src_data_t = typename prec_traits_t::type; + using dst_data_t = typename prec_traits_t::type; + using acc_data_t = typename prec_traits_t::type; private: const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); } diff --git a/src/cpu/sycl/engine.hpp b/src/cpu/sycl/engine.hpp index 9a2f8a67b4d..0563ab53dcd 100644 --- a/src/cpu/sycl/engine.hpp +++ b/src/cpu/sycl/engine.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2024 Intel Corporation +* Copyright 2019-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -44,6 +44,9 @@ class engine_t : public cpu::cpu_engine_t { status_t create_memory_storage(memory_storage_t **storage, unsigned flags, size_t size, void *handle) override { + assert(runtime_kind() == runtime_kind::sycl); + if (runtime_kind() != runtime_kind::sycl) return status::runtime_error; + return impl()->create_memory_storage( storage, this, flags, size, handle); } @@ -53,15 +56,12 @@ class engine_t : public cpu::cpu_engine_t { return cpu::sycl::stream_t::create_stream(stream, this, stream_impl); } - const ::sycl::device &device() const { return impl()->device(); } - const ::sycl::context &context() const { return impl()->context(); } - - xpu::sycl::backend_t backend() const { return impl()->backend(); } - bool mayiuse_system_memory_allocators() const override { return impl()->mayiuse_system_memory_allocators(); } + DECLARE_COMMON_SYCL_ENGINE_FUNCTIONS(); + protected: const xpu::sycl::engine_impl_t *impl() const { return (const xpu::sycl::engine_impl_t *)impl::engine_t::impl(); diff --git a/src/cpu/sycl/stream_cpu_thunk.cpp b/src/cpu/sycl/stream_cpu_thunk.cpp index c6fc0723758..fe6d5276936 100644 --- a/src/cpu/sycl/stream_cpu_thunk.cpp +++ b/src/cpu/sycl/stream_cpu_thunk.cpp @@ -41,6 +41,9 @@ void dnnl_impl_sycl_cpu_thunk(const thunk_params_t *params) { prim_iface->execute(submit_ctx->exec_ctx); + for (auto &m : submit_ctx->exec_ctx.args()) + m.second.mem->release(); + const_cast(prim_iface)->release(); delete submit_ctx; diff --git a/src/cpu/sycl/stream_submit_cpu_primitive.cpp b/src/cpu/sycl/stream_submit_cpu_primitive.cpp index 4fb94689fce..f2df6ca2d92 100644 --- a/src/cpu/sycl/stream_submit_cpu_primitive.cpp +++ b/src/cpu/sycl/stream_submit_cpu_primitive.cpp @@ -109,6 +109,7 @@ void submit_cpu_primitive(stream_t *stream, const primitive_iface_t *prim_iface, std::vector sycl_mem_storages; for (auto &a : exec_ctx.args()) { + a.second.mem->retain(); if (a.second.mem->engine()->runtime_kind() == runtime_kind::sycl) { auto *mem_storage = a.second.mem->memory_storage(); if (!mem_storage->is_null()) { diff --git a/src/cpu/ukernel/attr_params.cpp b/src/cpu/ukernel/attr_params.cpp new file mode 100644 index 00000000000..f56da41e789 --- /dev/null +++ b/src/cpu/ukernel/attr_params.cpp @@ -0,0 +1,83 @@ +/******************************************************************************* +* Copyright 2025 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. 
+* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#include "oneapi/dnnl/dnnl_ukernel.h" + +#include "cpu/ukernel/c_types_map.hpp" + +#if DNNL_X64 +#include "cpu/x64/ukernel/attr_params.hpp" +#endif + +#ifdef DNNL_EXPERIMENTAL_UKERNEL + +using namespace dnnl::impl; +using namespace dnnl::impl::cpu; +using namespace dnnl::impl::cpu::ukernel; + +status_t dnnl_ukernel_attr_params_create(attr_params_t **attr_params) { +#if DNNL_X64 + return x64::ukernel::dnnl_ukernel_attr_params_create(attr_params); +#endif + return status::unimplemented; +} + +status_t dnnl_ukernel_attr_params_set_post_ops_args( + attr_params_t *attr_params, const void **post_ops_args) { +#if DNNL_X64 + return x64::ukernel::dnnl_ukernel_attr_params_set_post_ops_args( + attr_params, post_ops_args); +#endif + return status::unimplemented; +} + +status_t dnnl_ukernel_attr_params_set_A_scales( + attr_params_t *attr_params, const void *a_scales) { +#if DNNL_X64 + return x64::ukernel::dnnl_ukernel_attr_params_set_A_scales( + attr_params, a_scales); +#endif + return status::unimplemented; +} + +status_t dnnl_ukernel_attr_params_set_B_scales( + attr_params_t *attr_params, const void *b_scales) { +#if DNNL_X64 + return x64::ukernel::dnnl_ukernel_attr_params_set_B_scales( + attr_params, b_scales); +#endif + return status::unimplemented; +} + +status_t dnnl_ukernel_attr_params_set_D_scales( + attr_params_t *attr_params, const void *d_scales) { +#if DNNL_X64 + return x64::ukernel::dnnl_ukernel_attr_params_set_D_scales( + attr_params, d_scales); +#endif + return status::unimplemented; +} + +status_t dnnl_ukernel_attr_params_destroy(attr_params_t *attr_params) { +#if DNNL_X64 + return x64::ukernel::dnnl_ukernel_attr_params_destroy(attr_params); +#endif + return status::unimplemented; +} + +#endif + +//vim: et ts=4 sw=4 cindent cino+=l0,\:4,N-s diff --git a/src/cpu/ukernel/brgemm.cpp b/src/cpu/ukernel/brgemm.cpp new file mode 100644 index 00000000000..bb3c27de1f9 --- /dev/null +++ b/src/cpu/ukernel/brgemm.cpp @@ -0,0 +1,157 @@ +/******************************************************************************* +* Copyright 2025 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +#include "oneapi/dnnl/dnnl_ukernel.h" + +#include "cpu/ukernel/c_types_map.hpp" + +#if DNNL_X64 +#include "cpu/x64/ukernel/brgemm.hpp" +#endif + +#ifdef DNNL_EXPERIMENTAL_UKERNEL + +using namespace dnnl::impl; +using namespace dnnl::impl::cpu; +using namespace dnnl::impl::cpu::ukernel; + +status_t dnnl_brgemm_create(brgemm_t **brgemm, dim_t M, dim_t N, dim_t K, + dim_t batch_size, dim_t lda, dim_t ldb, dim_t ldc, data_type_t a_dt, + data_type_t b_dt, data_type_t c_dt) { +#if DNNL_X64 + return x64::ukernel::dnnl_brgemm_create( + brgemm, M, N, K, batch_size, lda, ldb, ldc, a_dt, b_dt, c_dt); +#endif + return status::unimplemented; +} + +status_t dnnl_brgemm_set_add_C(brgemm_t *brgemm, int add_C) { +#if DNNL_X64 + return x64::ukernel::dnnl_brgemm_set_add_C(brgemm, add_C); +#endif + return status::unimplemented; +} + +status_t dnnl_brgemm_set_post_ops(brgemm_t *brgemm, dim_t ldd, data_type_t d_dt, + const post_ops_t *post_ops) { +#if DNNL_X64 + return x64::ukernel::dnnl_brgemm_set_post_ops(brgemm, ldd, d_dt, post_ops); +#endif + return status::unimplemented; +} + +status_t dnnl_brgemm_set_A_scales(brgemm_t *brgemm, int a_scale_mask) { +#if DNNL_X64 + return x64::ukernel::dnnl_brgemm_set_A_scales(brgemm, a_scale_mask); +#endif + return status::unimplemented; +} + +status_t dnnl_brgemm_set_B_scales(brgemm_t *brgemm, int b_scale_mask) { +#if DNNL_X64 + return x64::ukernel::dnnl_brgemm_set_B_scales(brgemm, b_scale_mask); +#endif + return status::unimplemented; +} + +status_t dnnl_brgemm_set_D_scales(brgemm_t *brgemm, int d_scale_mask) { +#if DNNL_X64 + return x64::ukernel::dnnl_brgemm_set_D_scales(brgemm, d_scale_mask); +#endif + return status::unimplemented; +} + +status_t dnnl_brgemm_finalize(brgemm_t *brgemm) { +#if DNNL_X64 + return x64::ukernel::dnnl_brgemm_finalize(brgemm); +#endif + return status::unimplemented; +} + +status_t dnnl_brgemm_get_B_pack_type( + pack_type_t *pack_type, data_type_t dt_a, data_type_t dt_b) { +#if DNNL_X64 + return x64::ukernel::dnnl_brgemm_get_B_pack_type(pack_type, dt_a, dt_b); +#endif + return status::unimplemented; +} + +status_t dnnl_brgemm_get_scratchpad_size(const brgemm_t *brgemm, size_t *size) { +#if DNNL_X64 + return x64::ukernel::dnnl_brgemm_get_scratchpad_size(brgemm, size); +#endif + return status::unimplemented; +} + +status_t dnnl_brgemm_is_execute_postops_valid( + const brgemm_t *brgemm, int *valid) { +#if DNNL_X64 + return x64::ukernel::dnnl_brgemm_is_execute_postops_valid(brgemm, valid); +#endif + return status::unimplemented; +} + +status_t dnnl_brgemm_set_hw_context(const brgemm_t *brgemm) { +#if DNNL_X64 + return x64::ukernel::dnnl_brgemm_set_hw_context(brgemm); +#endif + return status::unimplemented; +} + +status_t dnnl_brgemm_release_hw_context() { +#if DNNL_X64 + return x64::ukernel::dnnl_brgemm_release_hw_context(); +#endif + return status::unimplemented; +} + +status_t dnnl_brgemm_generate(brgemm_t *brgemm) { +#if DNNL_X64 + return x64::ukernel::dnnl_brgemm_generate(brgemm); +#endif + return status::unimplemented; +} + +status_t dnnl_brgemm_execute(const brgemm_t *brgemm, const void *A_ptr, + const void *B_ptr, const dim_t *A_B_offsets, void *C_ptr, + void *scratchpad_ptr) { +#if DNNL_X64 + return x64::ukernel::dnnl_brgemm_execute( + brgemm, A_ptr, B_ptr, A_B_offsets, C_ptr, scratchpad_ptr); +#endif + return status::unimplemented; +} + +status_t dnnl_brgemm_execute_postops(const brgemm_t *brgemm, const void *A_ptr, + const void *B_ptr, const dim_t 
*A_B_offsets, const void *C_ptr, + void *D_ptr, void *scratchpad_ptr, const attr_params_t *attr_params) { +#if DNNL_X64 + return x64::ukernel::dnnl_brgemm_execute_postops(brgemm, A_ptr, B_ptr, + A_B_offsets, C_ptr, D_ptr, scratchpad_ptr, attr_params); +#endif + return status::unimplemented; +} + +status_t dnnl_brgemm_destroy(brgemm_t *brgemm) { +#if DNNL_X64 + return x64::ukernel::dnnl_brgemm_destroy(brgemm); +#endif + return status::unimplemented; +} + +#endif + +//vim: et ts=4 sw=4 cindent cino+=l0,\:4,N-s diff --git a/src/cpu/ukernel/c_types_map.hpp b/src/cpu/ukernel/c_types_map.hpp new file mode 100644 index 00000000000..f4835779137 --- /dev/null +++ b/src/cpu/ukernel/c_types_map.hpp @@ -0,0 +1,53 @@ +/******************************************************************************* +* Copyright 2025 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#ifndef CPU_UKERNEL_C_TYPES_MAP_HPP +#define CPU_UKERNEL_C_TYPES_MAP_HPP + +#include "oneapi/dnnl/dnnl_ukernel_types.h" + +#include "common/c_types_map.hpp" + +#ifdef DNNL_EXPERIMENTAL_UKERNEL + +// A section identical to c_map_types.hpp but just for brgemm ukernel so far. +namespace dnnl { +namespace impl { +namespace cpu { +namespace ukernel { + +using pack_type_t = dnnl_pack_type_t; +namespace pack_type { +const pack_type_t undef = dnnl_pack_type_undef; +const pack_type_t no_trans = dnnl_pack_type_no_trans; +const pack_type_t trans = dnnl_pack_type_trans; +const pack_type_t pack32 = dnnl_pack_type_pack32; +} // namespace pack_type + +using attr_params_t = dnnl_ukernel_attr_params; +using brgemm_t = dnnl_brgemm; +using transform_t = dnnl_transform; + +} // namespace ukernel +} // namespace cpu +} // namespace impl +} // namespace dnnl + +#endif + +#endif + +//vim: et ts=4 sw=4 cindent cino+=l0,\:4,N-s diff --git a/src/cpu/ukernel/transform.cpp b/src/cpu/ukernel/transform.cpp new file mode 100644 index 00000000000..d76fb5ece5a --- /dev/null +++ b/src/cpu/ukernel/transform.cpp @@ -0,0 +1,65 @@ +/******************************************************************************* +* Copyright 2025 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +#include "oneapi/dnnl/dnnl_ukernel.h" + +#include "cpu/ukernel/c_types_map.hpp" + +#if DNNL_X64 +#include "cpu/x64/ukernel/transform.hpp" +#endif + +#ifdef DNNL_EXPERIMENTAL_UKERNEL + +using namespace dnnl::impl; +using namespace dnnl::impl::cpu; +using namespace dnnl::impl::cpu::ukernel; + +status_t dnnl_transform_create(transform_t **transform, dim_t K, dim_t N, + pack_type_t in_pack_type, dim_t in_ld, dim_t out_ld, data_type_t in_dt, + data_type_t out_dt) { +#if DNNL_X64 + return x64::ukernel::dnnl_transform_create( + transform, K, N, in_pack_type, in_ld, out_ld, in_dt, out_dt); +#endif + return status::unimplemented; +} + +status_t dnnl_transform_generate(transform_t *transform) { +#if DNNL_X64 + return x64::ukernel::dnnl_transform_generate(transform); +#endif + return status::unimplemented; +} + +status_t dnnl_transform_execute( + const transform_t *transform, const void *in_ptr, void *out_ptr) { +#if DNNL_X64 + return x64::ukernel::dnnl_transform_execute(transform, in_ptr, out_ptr); +#endif + return status::unimplemented; +} + +status_t dnnl_transform_destroy(transform_t *transform) { +#if DNNL_X64 + return x64::ukernel::dnnl_transform_destroy(transform); +#endif + return status::unimplemented; +} + +#endif + +//vim: et ts=4 sw=4 cindent cino+=l0,\:4,N-s diff --git a/src/cpu/x64/CMakeLists.txt b/src/cpu/x64/CMakeLists.txt index a03c573ecea..9f232c929f7 100644 --- a/src/cpu/x64/CMakeLists.txt +++ b/src/cpu/x64/CMakeLists.txt @@ -93,3 +93,4 @@ set(OBJ_LIB ${LIB_PACKAGE_NAME}_cpu_x64) add_library(${OBJ_LIB} OBJECT ${SOURCES}) set_property(GLOBAL APPEND PROPERTY DNNL_LIB_DEPS $) +enable_conditional_compilation4(${OBJ_LIB}) diff --git a/src/cpu/x64/amx_tile_configure.cpp b/src/cpu/x64/amx_tile_configure.cpp index 64bb3d80deb..9464c604617 100644 --- a/src/cpu/x64/amx_tile_configure.cpp +++ b/src/cpu/x64/amx_tile_configure.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2020-2024 Intel Corporation +* Copyright 2020-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -22,12 +22,12 @@ namespace impl { namespace cpu { namespace x64 { -struct jit_amx_tilecfg_t : public jit_generator { +struct jit_amx_tilecfg_t : public jit_generator_t { DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_amx_tilecfg_t) // TODO: Need to check status jit_amx_tilecfg_t(bool lazy = false) - : jit_generator(jit_name(), avx512_core_amx), is_lazy_(lazy) { + : jit_generator_t(jit_name(), avx512_core_amx), is_lazy_(lazy) { create_kernel(); } @@ -54,10 +54,11 @@ struct jit_amx_tilecfg_t : public jit_generator { sttilecfg(ptr[abi_param2]); // Move tilecfg into Zmm for further comparison. vmovdqu64(Xbyak::Zmm(0), ptr[abi_param2]); - // Sets `1` per word if values are equal. + // Sets `1` per word (32 words in a Zmm) if values are equal. vpcmpeqw(Xbyak::Opmask(0), Xbyak::Zmm(0), ptr[abi_param1]); - // `kortestw` will set CF=1 if all `1` in the mask. - kortestw(Xbyak::Opmask(0), Xbyak::Opmask(0)); + // `kortestd` sets CF=1 if all bits in the mask are `1`; the + // doubleword form covers all 32 compare bits. + kortestd(Xbyak::Opmask(0), Xbyak::Opmask(0)); // Checks if CF=1. If it is, everything matched, skipping config... jc(skip_tilecfg, T_NEAR); // ... otherwise, configure tile with user palette.
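The kortestw -> kortestd change above is a width fix: vpcmpeqw over a 64-byte Zmm produces 32 mask bits (one per 16-bit word), and kortestw inspects only the low 16 of them, so two tile configurations differing solely in the upper 32 bytes compared as equal and reconfiguration was wrongly skipped. A scalar model of the check (our illustration, not library code):

#include <cstdint>

// Models vpcmpeqw + kortest{w,d} on a 64-byte tile configuration.
bool tilecfg_matches(const uint16_t cur[32], const uint16_t req[32],
        bool use_kortestd) {
    uint32_t k = 0;
    for (int i = 0; i < 32; ++i)
        if (cur[i] == req[i]) k |= uint32_t(1) << i; // one bit per word
    // kortest sets CF when every tested mask bit is 1; the w form only
    // tests 16 bits, the d form all 32.
    const uint32_t tested = use_kortestd ? 0xffffffffu : 0x0000ffffu;
    return (k & tested) == tested;
}

int main() {
    uint16_t a[32] = {}, b[32] = {};
    b[31] = 1; // configs differ only in the last word
    const bool fixed = !tilecfg_matches(a, b, true); // kortestd: mismatch seen
    const bool buggy = tilecfg_matches(a, b, false); // kortestw: missed it
    return (fixed && buggy) ? 0 : 1;
}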
@@ -71,11 +72,11 @@ struct jit_amx_tilecfg_t : public jit_generator { } }; -struct jit_amx_tilerelease_t : public jit_generator { +struct jit_amx_tilerelease_t : public jit_generator_t { DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_amx_tilerelease_t) // TODO: Need to check status - jit_amx_tilerelease_t() : jit_generator(jit_name(), avx512_core_amx) { + jit_amx_tilerelease_t() : jit_generator_t(jit_name(), avx512_core_amx) { create_kernel(); } diff --git a/src/cpu/x64/brgemm/brgemm.cpp b/src/cpu/x64/brgemm/brgemm.cpp index 70c383978f2..4afa6f012c9 100644 --- a/src/cpu/x64/brgemm/brgemm.cpp +++ b/src/cpu/x64/brgemm/brgemm.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2020-2024 Intel Corporation +* Copyright 2020-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -81,7 +81,9 @@ void brgemm_desc_t::cleanup_dst_md() { void brgemm_kernel_execute(const brgemm_kernel_t *brg_kernel, int bs, const brgemm_batch_element_t *batch, void *ptr_C, void *scratch, - const brgemm_dynamic_values_t *dynamic_values) { + const brgemm_dynamic_values_t *dynamic_values, + const void *ptr_wei_scales, const void *ptr_wei_zero_points, + const void *ptr_src_scales, const void *ptr_src_grouped_sum, size_t ic) { brgemm_kernel_params_t brgemm_p; brgemm_p.batch = batch; @@ -101,6 +103,11 @@ void brgemm_kernel_execute(const brgemm_kernel_t *brg_kernel, int bs, brgemm_p.dynamic_LDC = dynamic_values->dynamic_LDC; brgemm_p.dynamic_LDD = dynamic_values->dynamic_LDD; } + brgemm_p.ptr_wei_scales = ptr_wei_scales; + brgemm_p.ptr_wei_zero_points = ptr_wei_zero_points; + brgemm_p.ptr_src_scales = ptr_src_scales; + brgemm_p.ptr_src_grouped_sum = ptr_src_grouped_sum; + brgemm_p.ic = ic; assert(brg_kernel); @@ -110,7 +117,9 @@ void brgemm_kernel_execute(const brgemm_kernel_t *brg_kernel, int bs, void brgemm_kernel_execute(const brgemm_kernel_t *brg_kernel, int bs, const void *addr_A, const void *addr_B, const brgemm_batch_element_t *batch, void *ptr_C, void *scratch, - const brgemm_dynamic_values_t *dynamic_values) { + const brgemm_dynamic_values_t *dynamic_values, + const void *ptr_wei_scales, const void *ptr_wei_zero_points, + const void *ptr_src_scales, const void *ptr_src_grouped_sum, size_t ic) { brgemm_kernel_params_t brgemm_p; brgemm_p.batch = batch; @@ -124,13 +133,17 @@ void brgemm_kernel_execute(const brgemm_kernel_t *brg_kernel, int bs, brgemm_p.do_apply_comp = 0; brgemm_p.skip_accm = 0; brgemm_p.BS = bs; + brgemm_p.ptr_wei_scales = ptr_wei_scales; + brgemm_p.ptr_wei_zero_points = ptr_wei_zero_points; + brgemm_p.ptr_src_scales = ptr_src_scales; + brgemm_p.ptr_src_grouped_sum = ptr_src_grouped_sum; + brgemm_p.ic = ic; if (dynamic_values) { brgemm_p.dynamic_LDA = dynamic_values->dynamic_LDA; brgemm_p.dynamic_LDB = dynamic_values->dynamic_LDB; brgemm_p.dynamic_LDC = dynamic_values->dynamic_LDC; brgemm_p.dynamic_LDD = dynamic_values->dynamic_LDD; } - assert(brg_kernel); (*brg_kernel)(&brgemm_p); } @@ -138,7 +151,9 @@ void brgemm_kernel_execute(const brgemm_kernel_t *brg_kernel, int bs, void brgemm_kernel_execute_postops(const brgemm_kernel_t *brg_kernel, int bs, const brgemm_batch_element_t *batch, void *ptr_C, void *ptr_D, const brgemm_post_ops_data_t &post_ops_data, void *scratch, - const brgemm_dynamic_values_t *dynamic_values) { + const brgemm_dynamic_values_t *dynamic_values, + const void *ptr_wei_scales, const void *ptr_wei_zero_points, + const void 
*ptr_src_scales, const void *ptr_src_grouped_sum, size_t ic) { brgemm_kernel_params_t brgemm_p; brgemm_p.batch = batch; @@ -165,13 +180,17 @@ void brgemm_kernel_execute_postops(const brgemm_kernel_t *brg_kernel, int bs, brgemm_p.b_zp_compensations = post_ops_data.b_zp_compensations; brgemm_p.c_zp_values = post_ops_data.c_zp_values; brgemm_p.ptr_dst_scales = post_ops_data.dst_scales; + brgemm_p.ptr_wei_scales = ptr_wei_scales; + brgemm_p.ptr_wei_zero_points = ptr_wei_zero_points; + brgemm_p.ptr_src_scales = ptr_src_scales; + brgemm_p.ptr_src_grouped_sum = ptr_src_grouped_sum; + brgemm_p.ic = ic; if (dynamic_values) { brgemm_p.dynamic_LDA = dynamic_values->dynamic_LDA; brgemm_p.dynamic_LDB = dynamic_values->dynamic_LDB; brgemm_p.dynamic_LDC = dynamic_values->dynamic_LDC; brgemm_p.dynamic_LDD = dynamic_values->dynamic_LDD; } - assert(brg_kernel); (*brg_kernel)(&brgemm_p); } @@ -180,7 +199,9 @@ void brgemm_kernel_execute_postops(const brgemm_kernel_t *brg_kernel, int bs, const void *addr_A, const void *addr_B, const brgemm_batch_element_t *batch, void *ptr_C, void *ptr_D, const brgemm_post_ops_data_t &post_ops_data, void *scratch, - const brgemm_dynamic_values_t *dynamic_values) { + const brgemm_dynamic_values_t *dynamic_values, + const void *ptr_wei_scales, const void *ptr_wei_zero_points, + const void *ptr_src_scales, const void *ptr_src_grouped_sum, size_t ic) { brgemm_kernel_params_t brgemm_p; brgemm_p.batch = batch; @@ -205,8 +226,14 @@ void brgemm_kernel_execute_postops(const brgemm_kernel_t *brg_kernel, int bs, brgemm_p.first_mb_matrix_addr_off = post_ops_data.first_mb_matrix_addr_off; brgemm_p.a_zp_compensations = post_ops_data.a_zp_compensations; brgemm_p.b_zp_compensations = post_ops_data.b_zp_compensations; + brgemm_p.a_zp_values = post_ops_data.a_zp_values; brgemm_p.c_zp_values = post_ops_data.c_zp_values; brgemm_p.ptr_dst_scales = post_ops_data.dst_scales; + brgemm_p.ptr_wei_scales = ptr_wei_scales; + brgemm_p.ptr_wei_zero_points = ptr_wei_zero_points; + brgemm_p.ptr_src_scales = ptr_src_scales; + brgemm_p.ptr_src_grouped_sum = ptr_src_grouped_sum; + brgemm_p.ic = ic; if (dynamic_values) { brgemm_p.dynamic_LDA = dynamic_values->dynamic_LDA; brgemm_p.dynamic_LDB = dynamic_values->dynamic_LDB; @@ -218,11 +245,13 @@ void brgemm_kernel_execute_postops(const brgemm_kernel_t *brg_kernel, int bs, (*brg_kernel)(&brgemm_p); } +// from ov dyn_quant status_t brgemm_desc_init(brgemm_desc_t *brg, cpu_isa_t isa, brgemm_batch_kind_t type, impl::data_type_t dt_a, impl::data_type_t dt_b, bool transA, bool transB, brgemm_layout_t layout, float alpha, float beta, dim_t LDA, dim_t LDB, - dim_t LDC, dim_t M, dim_t N, dim_t K, const brgemm_strides_t *strides) { + dim_t LDC, dim_t M, dim_t N, dim_t K, const brgemm_strides_t *strides, + bool is_weights_decompression, bool is_src_dynamic_quantization, const memory_desc_t *wei_md, const primitive_attr_t *attr) { /* m - number of rows of the matrix op(A) and number of rows of the matrix C n - number of columns of the matrix op(B) and number of columns of the matrix C @@ -230,37 +259,95 @@ status_t brgemm_desc_init(brgemm_desc_t *brg, cpu_isa_t isa, Matrices are in row-major layouts: A: lda * m, LDA - lda must be at least max(1, k) - B: ldb * k, LDB - ldb must be at least max(1, n) - C: ldc * m, LDC - ldc must be at least max(1, n) + B: ldb * k, LDB - ldb must be at least max(1, n) + C: ldc * m, LDC - ldc must be at least max(1, n) - Matrices are in column-major layouts: + Matrices are in column-major layouts: A: lda * k, LDA - lda must be at least max(1, m) 
- B: ldb * n, LDB - ldb must be at least max(1, k) - C: ldc * n, LDC - ldc must be at least max(1, m) - */ - if (brg == nullptr) return status::invalid_arguments; + B: ldb * n, LDB - ldb must be at least max(1, k) + C: ldc * n, LDC - ldc must be at least max(1, m) + */ + if (brg == nullptr) return status::invalid_arguments; if (transA || transB) return status::unimplemented; - if (type == brgemm_batch_kind_t::brgemm_batch_kind_undef) - return status::invalid_arguments; + + brg->with_wei_decomp = is_weights_decompression; + brg->with_src_dyn_quant = is_src_dynamic_quantization; brgemm_utils::init_brgemm_conf(brg, isa, type, dt_a, dt_b, layout, alpha, beta, LDA, LDB, LDC, M, N, K, strides); - if (utils::one_of(true, brg->is_runtime_lda, brg->is_runtime_ldb)) - return status::unimplemented; - if (M <= 0 || N <= 0 || K <= 0) return status::invalid_arguments; + // Upper bound, this can likely be improved by accounting for blocking + dim_t max_a_stride = brg->LDA * types::data_type_size(brg->dt_a) + * (brg->layout == brgemm_col_major ? K : M); + dim_t max_b_stride = brg->LDB * types::data_type_size(brg->dt_b) + * (brg->layout == brgemm_col_major ? N : K); + dim_t max_c_stride = brg->LDC * types::data_type_size(brg->dt_c) + * (brg->layout == brgemm_col_major ? N : M); + + // Required for EVEX encoding for offsets + const dim_t max_stride = std::numeric_limits::max(); + if ((max_a_stride > max_stride && !brg->is_runtime_lda) + || (max_b_stride > max_stride && !brg->is_runtime_ldb) + || (max_c_stride >= max_stride && !brg->is_runtime_ldc)) + return status::unimplemented; + if (utils::everyone_is(false, brg->is_int8, brg->is_bf16, brg->is_f32, - brg->is_f16, brg->is_fp8)) + brg->is_f16/*, brg->is_fp8*/)) return status::unimplemented; - // Only amx_int8 kernel supports u8 weights. + // Only avx512_core_amx kernel supports u8 weights. if (!IMPLICATION( - brg->dt_b == u8, is_superset(brg->isa_impl, avx512_core_amx))) + brg->dt_b == u8, is_superset(brg->isa_impl, avx512_core_amx)) && !brg->with_wei_decomp) return status::unimplemented; - CHECK(brgemm_blocking(brg)); + const memory_desc_wrapper wei_d(wei_md); + if (brg->with_wei_decomp) { + brg->with_grouped_wei_decomp = false; + + auto wei_scales = attr->scales_.get(DNNL_ARG_WEIGHTS); + brg->with_wei_decomp_scales = !wei_scales.has_default_values(); + brg->wei_decomp_scales_group_size = wei_d.dims()[1]; + if (brg->with_wei_decomp_scales) { + brg->wei_decomp_scales_dt = wei_scales.get_data_type(); + if (!one_of(brg->wei_decomp_scales_dt, f32, e8m0)) + return status::unimplemented; + + auto ld_dim = wei_scales.get_dims()[0]; + brg->wei_decomp_scales_stride = ld_dim > 1 ? ld_dim : 0; + brg->wei_decomp_scales_group_size = wei_d.dims()[1] / wei_scales.get_dims()[1]; + brg->with_grouped_wei_decomp |= wei_scales.get_dims()[1] != 1; + } + + brg->with_wei_decomp_zero_points = !attr->zero_points_.has_default_values(DNNL_ARG_WEIGHTS); + brg->wei_decomp_zero_points_group_size = wei_d.dims()[1]; + if (brg->with_wei_decomp_zero_points) { + brg->wei_decomp_zero_points_dt = attr->zero_points_.get_data_type(DNNL_ARG_WEIGHTS); + if (!one_of(brg->wei_decomp_zero_points_dt, f32, u8)) + return status::unimplemented; + + auto ld_dim = attr->zero_points_.get_dims(DNNL_ARG_WEIGHTS)[0]; + brg->wei_decomp_zero_points_stride = ld_dim > 1 ? 
ld_dim : 0; + brg->wei_decomp_zero_points_group_size = wei_d.dims()[1] / attr->zero_points_.get_dims(DNNL_ARG_WEIGHTS)[1]; + brg->with_grouped_wei_decomp |= attr->zero_points_.get_dims(DNNL_ARG_WEIGHTS)[1] != 1; + } + } + + brg->src_scales_group_size = wei_d.dims()[1]; + if (brg->with_src_dyn_quant) { + brg->src_scales_group_size = attr->src_dyn_quant_params_.get(); + brg->with_grouped_wei_decomp = true; + brg->src_scales_stride = div_up(wei_d.dims()[1], brg->src_scales_group_size); + } + + CHECK(brgemm_desc_finalize(brg)); + + brg->src_sum_group_size = wei_d.dims()[1]; + if (brg->with_src_dyn_quant) { + brg->src_sum_group_size = brg->rd_block; + brg->src_grouped_sum_stride = div_up(wei_d.dims()[1], brg->src_sum_group_size); + } // avx2_vnni_2 kernel with xf16 data type requires blocked weights. if (brg->isa_impl == avx2_vnni_2 && brg->is_xf16() @@ -290,14 +377,13 @@ status_t brdgmm_desc_init(brgemm_desc_t *brg, cpu_isa_t isa, false, brg->is_int8, brg->is_bf16, brg->is_f32, brg->is_f16)) return status::unimplemented; - CHECK(brdgmm_blocking(brg)); - return status::success; } status_t brgemm_desc_set_postops(brgemm_desc_t *brg, const primitive_attr_t *attr, const memory_desc_t *dst_md, dim_t LDD, - impl::data_type_t dt_bias) { + impl::data_type_t dt_bias, + bool is_weights_decompression) { if (!brg || !dst_md) return status::invalid_arguments; brg->set_attr(attr); @@ -348,13 +434,15 @@ status_t brgemm_desc_set_postops(brgemm_desc_t *brg, data_type::f16))) return status::unimplemented; const auto bias_f8_e5m2_compatible - = one_of(dt_d, data_type::f32, data_type::f16, data_type::f8_e5m2) + = one_of(dt_d, data_type::f32, data_type::f16, data_type::bf16, + data_type::f8_e5m2) && one_of(dt_bias, data_type::undef, data_type::f32, data_type::f16, - data_type::f8_e5m2, data_type::f8_e4m3); + data_type::bf16, data_type::f8_e5m2, data_type::f8_e4m3); const auto bias_f8_e4m3_compatible - = one_of(dt_d, data_type::f32, data_type::f16, data_type::f8_e4m3) + = one_of(dt_d, data_type::f32, data_type::f16, data_type::bf16, + data_type::f8_e4m3) && one_of(dt_bias, data_type::undef, data_type::f32, data_type::f16, - data_type::f8_e4m3, data_type::f8_e5m2); + data_type::bf16, data_type::f8_e4m3, data_type::f8_e5m2); if (!IMPLICATION(brg->is_fp8, bias_f8_e5m2_compatible || bias_f8_e4m3_compatible)) return status::unimplemented; @@ -371,9 +459,6 @@ status_t brgemm_desc_set_postops(brgemm_desc_t *brg, brg->is_bf16_emu = !(mayiuse(avx512_core_bf16) || brg->isa_impl == avx2_vnni_2); - // Rerun blocking heuristic due to reduced zmm register count - if (brg->is_bf16_emu && brg->is_dgmm) CHECK(brdgmm_blocking(brg)); - if (!brg->attr()) return status::success; using namespace injector; @@ -400,6 +485,7 @@ status_t brgemm_desc_set_postops(brgemm_desc_t *brg, false /*sum_requires_zp_zero*/, true /*sum_requires_same_params*/, {broadcasting_strategy_t::per_oc, + broadcasting_strategy_t::per_oc_d, broadcasting_strategy_t::scalar, broadcasting_strategy_t::per_mb, broadcasting_strategy_t::per_mb_spatial, @@ -426,55 +512,60 @@ status_t brgemm_desc_set_postops(brgemm_desc_t *brg, const auto &wei_scales = attr->scales_.get(DNNL_ARG_WEIGHTS); brg->with_scales = !brg->skip_scales && (!src_scales.has_default_values() - || !wei_scales.has_default_values() + || (!wei_scales.has_default_values() && !is_weights_decompression) || brg->with_weights_scale_adjust); if (brg->with_scales) { // Note. 
the current version supports only two different output scale // types: - // 1) common (mask_ = 0) + // 1) common (mask = 0) // 2) per_n_dim_scale - broadcast across n dimension; - // for convolution and inner product primitives it corresponds - // to "per_oc" mask_ = 1 << 1; for matmul - to - // mask_ = (1 << (ndims - 1))), where ndims is number of + // to "per_oc" mask = 1 << 1; for matmul - to + // mask = (1 << (ndims - 1))), where ndims is number of // dimensions for original matmul problem - // So if wei_scales.mask_ != 0 (not common) it's assumed here that scale - // type is per_n_dim_scale and driver which calls brgemm kernel checked - // that mask has correct value for this case - brg->is_oc_scale = wei_scales.mask_ != 0; + // So if wei_scales.get_mask() > 0 (not common) it is assumed here that + // the scale type is per_n_dim_scale and the driver that calls the brgemm + // kernel has checked that the mask is correct for this case + brg->is_oc_scale = wei_scales.get_mask() > 0; } const auto &dst_scales = attr->scales_.get(DNNL_ARG_DST); brg->with_dst_scales = !dst_scales.has_default_values(); - const bool scales_ok = src_scales.mask_ == 0 && dst_scales.mask_ == 0 - && attr->scales_.has_default_values( - {DNNL_ARG_SRC, DNNL_ARG_WEIGHTS, DNNL_ARG_DST}); + const bool scales_ok = attr->scales_.has_default_values({DNNL_ARG_SRC, + DNNL_ARG_WEIGHTS, DNNL_ARG_DST}) + && IMPLICATION(!src_scales.has_default_values(), + src_scales.get_mask() == 0) + && IMPLICATION(!dst_scales.has_default_values(), + dst_scales.get_mask() == 0); if (!scales_ok) return status::unimplemented; auto init_zp_type = [&](brgemm_broadcast_t &zp_type, int mem_arg) -> status_t { - auto zero_points = attr->zero_points_; - - // common zero point type is supported for now - if (!zero_points.common(mem_arg)) return status::unimplemented; + const auto &zp = attr->zero_points_; + // Always init a default value. + zp_type = brgemm_broadcast_t::none; const bool skip_zero_point - = mem_arg == DNNL_ARG_WEIGHTS && brg->skip_zp_b_compensation; - zp_type = zero_points.has_default_values(mem_arg) || skip_zero_point - ?
brgemm_broadcast_t::none - : brgemm_broadcast_t::per_tensor; + = (mem_arg == DNNL_ARG_WEIGHTS && brg->skip_zp_b_compensation); + if (skip_zero_point) return status::success; + if (!zp.has_default_values(mem_arg)) { + int mask = zp.get_mask(mem_arg); + if (mask == 0) { + zp_type = brgemm_broadcast_t::per_tensor; + } else if (mask == (1 << 1)) { + zp_type = brgemm_broadcast_t::per_n; + } else if (mask == 1 && mem_arg == DNNL_ARG_WEIGHTS ) { + zp_type = brgemm_broadcast_t::none; + } else { + return status::unimplemented; + } + } return status::success; }; - init_zp_type(brg->zp_type_a, DNNL_ARG_SRC); - init_zp_type(brg->zp_type_b, DNNL_ARG_WEIGHTS); - init_zp_type(brg->zp_type_c, DNNL_ARG_DST); - - // Post-ops may use vector registers so brgemm/brdgmm blocking may need to - // be updated - if (brg->is_dgmm) - CHECK(brdgmm_blocking(brg)); - else - CHECK(brgemm_blocking(brg)); + CHECK(init_zp_type(brg->zp_type_a, DNNL_ARG_SRC)); + CHECK(init_zp_type(brg->zp_type_b, DNNL_ARG_WEIGHTS)); + CHECK(init_zp_type(brg->zp_type_c, DNNL_ARG_DST)); return status::success; } @@ -505,42 +596,6 @@ status_t brgemm_desc_set_attr( if (brgattr.fpmath_mode != fpmath_mode::strict) maybe_try_bf32(brg); - const int max_vpad = nstl::max(brgattr.max_top_vpad, - brgattr.max_bottom_vpad); // these should be equal - bool hint_blocking_set - = (brgattr.hint_bd_block != 0 || brgattr.hint_bd_block2 != 0 - || brgattr.hint_ld_block != 0 || brgattr.hint_ld_block2 != 0 - || brgattr.hint_load_nt_A != brgemm_hint_nt_undef - || brgattr.hint_load_nt_B != brgemm_hint_nt_undef - || brgattr.hint_bs_group > 1); - if (brgattr.use_uker || brg->is_bf16_tmm || hint_blocking_set - || brgattr.bd_mask_level - || brgattr.fpmath_mode != fpmath_mode::strict || max_vpad > 0) { - if (brg->is_dgmm) - CHECK(brdgmm_blocking(brg)); - else - CHECK(brgemm_blocking(brg)); - } - - if (!brg->is_dgmm) { - // virtual padding is restricted by bd_block size due to - // brgemm_kernel implementation. TODO: remove this restriction - const int min_bd_block - = brg->bdb_tail > 0 ? brg->bdb_tail : brg->bd_block; - if ((max_vpad > min_bd_block)) return status::unimplemented; - } - - brg->LDA2 = (brgattr.LDA2 != 0) ? brgattr.LDA2 : brg->LDA; - brg->LDB2 = (brgattr.LDB2 != 0) ? brgattr.LDB2 : brg->LDB; - brg->LDC2_M = (brgattr.LDC2_M != 0) ? brgattr.LDC2_M : brg->LDC; - brg->LDC2_N = (brgattr.LDC2_N != 0) ? brgattr.LDC2_N : brg->ld_block; - - brg->is_blocked = (brg->LDA2 != brg->LDA || brg->LDB2 != brg->LDB - || brg->LDC2_M != brg->LDC || brg->LDC2_N != brg->ld_block); - - if (!IMPLICATION(brg->is_blocked, brg->layout == brgemm_row_major)) - return status::invalid_arguments; - // virtual padding is not supported for "amx" if ((brgattr.max_top_vpad > 0 || brgattr.max_bottom_vpad > 0) && (brg->is_tmm)) @@ -568,6 +623,28 @@ status_t brgemm_desc_set_attr( return status::success; } +status_t brgemm_desc_finalize(brgemm_desc_t *brg) { + if (brg == nullptr) return status::invalid_arguments; + + const int max_vpad = nstl::max( + brg->brgattr.max_top_vpad, brg->brgattr.max_bottom_vpad); + + if (brg->is_dgmm) + CHECK(brdgmm_blocking(brg)); + else + CHECK(brgemm_blocking(brg)); + + if (!brg->is_dgmm) { + // virtual padding is restricted by bd_block size due to + // brgemm_kernel implementation. TODO: remove this restriction + const int min_bd_block + = brg->bdb_tail > 0 ? 
brg->bdb_tail : brg->bd_block; + if ((max_vpad > min_bd_block)) return status::unimplemented; + } + + return status::success; +} + status_t brgemm_kernel_create( brgemm_kernel_t **brg_kernel, const brgemm_desc_t &brg) { if (!brg_kernel) return status::invalid_arguments; @@ -617,10 +694,11 @@ status_t brgemm_kernel_destroy(brgemm_kernel_t *brg_kernel) { status_t brgemm_init_tiles(const brgemm_desc_t &brg, char palette[64]) { if (!brg.is_tmm) return status::unimplemented; - //TODO: Add support of tail processing by reduction dimension auto rd_block = (!brg.rdb && brg.rdb_tail) ? brg.rdb_tail : brg.rd_block; if (brg.is_input_convert()) rd_block = utils::rnd_up(rd_block, 2 /*vnni_granularity*/); + else + rd_block = utils::rnd_up(rd_block, brg.rd_step); palette_config_t *buff = (palette_config_t *)(palette); @@ -762,11 +840,13 @@ int brgemm_cmp(const brgemm_desc_t &lhs, const brgemm_desc_t &rhs) { CMP_BRGEMM_FIELD(brgattr.hint_prfB.dist2); CMP_BRGEMM_FIELD(brgattr.hint_prfC.dist1); CMP_BRGEMM_FIELD(brgattr.hint_prfC.dist2); - CMP_BRGEMM_FIELD(brgattr.wary_tail_read); + CMP_BRGEMM_FIELD(brgattr.wary_A_k_tail_read); + CMP_BRGEMM_FIELD(brgattr.extendable_k); CMP_BRGEMM_FIELD(brgattr.generate_skip_accumulation); CMP_BRGEMM_FIELD(brgattr.bd_mask_level); CMP_BRGEMM_FIELD(brgattr.use_uker); CMP_BRGEMM_FIELD(brgattr.use_interleave_stores); + CMP_BRGEMM_FIELD(brgattr.b_is_vnni); CMP_BRGEMM_FIELD(brgattr.fpmath_mode); CMP_BRGEMM_FIELD(brgattr.LDA2); CMP_BRGEMM_FIELD(brgattr.LDB2); diff --git a/src/cpu/x64/brgemm/brgemm.hpp b/src/cpu/x64/brgemm/brgemm.hpp index 084013e13c5..1f1b7771606 100644 --- a/src/cpu/x64/brgemm/brgemm.hpp +++ b/src/cpu/x64/brgemm/brgemm.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2020-2024 Intel Corporation +* Copyright 2020-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -62,12 +62,15 @@ namespace x64 { /// @param strides Strides between the matrices in the batch. Can be nullptr. /// TODO: what does "Can be nullptr" mean? /// + status_t DNNL_API brgemm_desc_init(brgemm_desc_t *brg, cpu_isa_t isa, brgemm_batch_kind_t type, impl::data_type_t dt_a, impl::data_type_t dt_b, bool transA, bool transB, brgemm_layout_t layout, float alpha, float beta, dim_t LDA, dim_t LDB, dim_t LDC, dim_t M, dim_t N, dim_t K, - const brgemm_strides_t *strides = nullptr); + const brgemm_strides_t *strides = nullptr, + bool is_weights_decompression = false, bool is_src_dynamic_quantization = false, + const memory_desc_t *wei_md = nullptr, const primitive_attr_t *attr = nullptr); /// Initializes a BRGEMM descriptor with B matrix as a diagonal matrix /// represented in packed vector format. @@ -119,7 +122,8 @@ status_t DNNL_API brdgmm_desc_init(brgemm_desc_t *brg, cpu_isa_t isa, /// status_t DNNL_API brgemm_desc_set_postops(brgemm_desc_t *brg, const primitive_attr_t *attr, const memory_desc_t *dst_md, dim_t LDD, - impl::data_type_t dt_bias = impl::data_type::undef); + impl::data_type_t dt_bias = impl::data_type::undef, + bool is_weights_decompression = false); /// Adds BRGEMM attributes to BRGEMM descriptor /// @@ -130,6 +134,15 @@ status_t DNNL_API brgemm_desc_set_postops(brgemm_desc_t *brg, status_t DNNL_API brgemm_desc_set_attr( brgemm_desc_t *brg, const brgemm_attr_t &brgattr); +/// Finalize BRGEMM descriptor. 
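+///
+/// A minimal call-order sketch (illustrative only; the ISA, data types,
+/// sizes, `strides`, `attr`, `brgattr`, and `dst_md` below are assumed to
+/// be provided by the caller):
+/// @code
+/// brgemm_desc_t brg;
+/// CHECK(brgemm_desc_init(&brg, isa, brgemm_strd, dt_a, dt_b,
+///         /*transA=*/false, /*transB=*/false, brgemm_row_major,
+///         /*alpha=*/1.0f, /*beta=*/0.0f, LDA, LDB, LDC, M, N, K, &strides));
+/// CHECK(brgemm_desc_set_postops(&brg, attr, dst_md, LDD));
+/// CHECK(brgemm_desc_set_attr(&brg, brgattr));
+/// CHECK(brgemm_desc_finalize(&brg)); // blocking is computed here
+/// brgemm_kernel_t *kernel = nullptr;
+/// CHECK(brgemm_kernel_create(&kernel, brg));
+/// @endcode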
+/// +/// @param brg Output BRGEMM descriptor +/// This function must be called after all the fields of the descriptor are set. +/// It finalizes the descriptor including internal blocking parameters to +/// prepare it for the kernel creation. +/// +status_t DNNL_API brgemm_desc_finalize(brgemm_desc_t *brg); + /// Generates a BRGEMM kernel based on descriptor /// /// @param brg_kernel Output BRGEMM kernel @@ -169,7 +182,9 @@ status_t DNNL_API brgemm_kernel_destroy(brgemm_kernel_t *brg_kernel); void DNNL_API brgemm_kernel_execute(const brgemm_kernel_t *brg_kernel, int bs, const brgemm_batch_element_t *batch, void *ptr_C, void *scratch = nullptr, - const brgemm_dynamic_values_t *dynamic_values = nullptr); + const brgemm_dynamic_values_t *dynamic_values = nullptr, + const void *ptr_wei_scales = nullptr, const void *ptr_wei_zero_points = nullptr, + const void *ptr_src_scales = nullptr, const void *ptr_src_grouped_sum = nullptr, size_t ic = 0); /// Execute BRGEMM kernel (brgemm_offs and brgemm_strd version) /// @@ -197,7 +212,9 @@ void brgemm_kernel_execute(const brgemm_kernel_t *brg_kernel, int bs, const void *addr_A, const void *addr_B, const brgemm_batch_element_t *batch, void *ptr_C, void *scratch = nullptr, - const brgemm_dynamic_values_t *dynamic_values = nullptr); + const brgemm_dynamic_values_t *dynamic_values = nullptr, + const void *ptr_wei_scales = nullptr, const void *ptr_wei_zero_points = nullptr, + const void *ptr_src_scales = nullptr, const void *ptr_src_grouped_sum = nullptr, size_t ic = 0); /// Execute BRGEMM kernel (brgemm_addr version) /// @@ -224,7 +241,9 @@ void brgemm_kernel_execute(const brgemm_kernel_t *brg_kernel, int bs, void DNNL_API brgemm_kernel_execute_postops(const brgemm_kernel_t *brg_kernel, int bs, const brgemm_batch_element_t *batch, void *ptr_C, void *ptr_D, const brgemm_post_ops_data_t &post_ops_data, void *scratch = nullptr, - const brgemm_dynamic_values_t *dynamic_values = nullptr); + const brgemm_dynamic_values_t *dynamic_values = nullptr, + const void *ptr_wei_scales = nullptr, const void *ptr_wei_zero_points = nullptr, + const void *ptr_src_scales = nullptr, const void *ptr_src_grouped_sum = nullptr, size_t ic = 0); /// Execute BRGEMM kernel (brgemm_offs and brgemm_strd version) /// @@ -255,7 +274,9 @@ void DNNL_API brgemm_kernel_execute_postops(const brgemm_kernel_t *brg_kernel, int bs, const void *addr_A, const void *addr_B, const brgemm_batch_element_t *batch, void *ptr_C, void *ptr_D, const brgemm_post_ops_data_t &post_ops_data, void *scratch = nullptr, - const brgemm_dynamic_values_t *dynamic_values = nullptr); + const brgemm_dynamic_values_t *dynamic_values = nullptr, + const void *ptr_wei_scales = nullptr, const void *ptr_wei_zero_points = nullptr, + const void *ptr_src_scales = nullptr, const void *ptr_src_grouped_sum = nullptr, size_t ic = 0); /// AMX utilities: Creates a palette based on BRGEMM descriptor /// diff --git a/src/cpu/x64/brgemm/brgemm_containers.cpp b/src/cpu/x64/brgemm/brgemm_containers.cpp index b0a3d047d43..4fc7df86066 100644 --- a/src/cpu/x64/brgemm/brgemm_containers.cpp +++ b/src/cpu/x64/brgemm/brgemm_containers.cpp @@ -49,6 +49,10 @@ bool brgemm_desc_container_t::insert(int idx, brgemm_desc_t &brg, brg.brgattr.static_offsets = static_offsets_list_.back().data(); const auto ret = map_.insert({brg, idx}); + const int ref_size = refs_.size(); + if (idx > ref_size - 1) { + refs_.resize(idx + 1); + } refs_[idx] = &(ret.first->first); // if there was no insertion then clean bd_mask and static_offsets if (!ret.second) { diff --git 
a/src/cpu/x64/brgemm/brgemm_containers.hpp b/src/cpu/x64/brgemm/brgemm_containers.hpp index 5a2eabe4b4b..5f5a7c67177 100644 --- a/src/cpu/x64/brgemm/brgemm_containers.hpp +++ b/src/cpu/x64/brgemm/brgemm_containers.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2023-2024 Intel Corporation +* Copyright 2023-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -34,7 +34,7 @@ namespace brgemm_containers { struct brgemm_desc_container_t { public: - brgemm_desc_container_t() {} + brgemm_desc_container_t() = default; brgemm_desc_container_t(size_t ns) { resize(ns); } void resize(size_t ns) { refs_.resize(ns); } inline const brgemm_desc_t *operator[](int idx) const { return refs_[idx]; } @@ -71,7 +71,7 @@ struct brgemm_desc_container_t { // #define BRGEMM_KERNEL_GLOBAL_STORAGE struct brgemm_kernel_container_t { - brgemm_kernel_container_t() {} + brgemm_kernel_container_t() = default; brgemm_kernel_container_t(size_t ns) { resize(ns); } void resize(size_t ns) { refs_.resize(ns); } inline const brgemm_kernel_t *operator[](int idx) const { @@ -111,9 +111,9 @@ struct brgemm_kernel_container_t { }; struct brgemm_palette_container_t { - typedef std::array<char, 64> S_t; + using S_t = std::array<char, 64>; - brgemm_palette_container_t() {} + brgemm_palette_container_t() = default; brgemm_palette_container_t(size_t ns) { resize(ns); } void resize(size_t ns) { refs_.resize(ns); } diff --git a/src/cpu/x64/brgemm/brgemm_types.hpp b/src/cpu/x64/brgemm/brgemm_types.hpp index 9054081576e..624777de8ac 100644 --- a/src/cpu/x64/brgemm/brgemm_types.hpp +++ b/src/cpu/x64/brgemm/brgemm_types.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2020-2024 Intel Corporation +* Copyright 2020-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,6 +20,7 @@ #include "common/primitive_attr.hpp" #include "cpu/platform.hpp" #include "cpu/x64/cpu_isa_traits.hpp" +#include "cpu/x64/jit_generator.hpp" namespace dnnl { namespace impl { @@ -96,11 +97,7 @@ struct brgemm_prf_t { }; struct brgemm_batch_element_t { - brgemm_batch_element_t() { - ptr.A = ptr.B = nullptr; - vvpad.top = vvpad.bottom = 0; - has_s8s8_comp_batch_pad = 0; - } + brgemm_batch_element_t() { ptr.A = ptr.B = nullptr; } union { struct { const void *A; @@ -112,14 +109,14 @@ } offset; }; struct { - dim_t top; - dim_t bottom; + dim_t top = 0; + dim_t bottom = 0; } vvpad; // w.r.t. M dimension // Used to calculate compensation when batch padding is present. // Note: batch_pad represents the overlap between weights and the height // dimension w.r.t. convolution dimensions. - dim_t has_s8s8_comp_batch_pad; + dim_t has_s8s8_comp_batch_pad = 0; }; struct DNNL_API brgemm_attr_t { @@ -138,7 +135,22 @@ = brgemm_kernel_prefetching_t::brgemm_prf_default; brgemm_prf_t hint_prfA, hint_prfB, hint_prfC; - bool wary_tail_read; + // This parameter determines how we will read the tail by K dimension from + // matrix A. For AMX, if the parameter is true, the brgemm will first + // copy the data to an intermediate buffer and only then use the tileload.
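+    // A rough sketch of that AMX path (an editorial simplification;
+    // `a_row`, `k_tail_bytes`, and the tile load below are illustrative
+    // names, not the actual kernel code):
+    //     alignas(64) char scratch[64] = {0};        // zero-padded row
+    //     std::memcpy(scratch, a_row, k_tail_bytes); // touch only valid bytes
+    //     // tileload from scratch: a full-width load is now safe
+    // i.e. an extra copy is traded for never reading past the end of A.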
+ // For non-AMX the A data are loaded byte by byte if flag is set + bool wary_A_k_tail_read {false}; + // For AMX the K dimension given to the brgemm is required to be divisible + // by vnni granularity. In addition blocking by K dimension may not be + // optimal if K greater than tile size and divisible by it. + // The parameter 'extendable_k' enables the brgemm to use the optimal K + // block size assuming that the following requirements for the matrix B are + // fulfilled: + // - It is ​​properly blocked (64 bytes block by K dimension). + // - The dimension K is padded by zeros. + // For K tail handling in this case the brgemm behavior is determined by the + // 'wary_A_k_tail_read' parameter. + bool extendable_k {false}; bool generate_skip_accumulation; // Value of bd_mask_level specifies how bd_mask is used in brgemm kernel // 0 – bd_mask is not used @@ -152,6 +164,7 @@ struct DNNL_API brgemm_attr_t { // interleave stores or not bool use_interleave_stores; impl::fpmath_mode_t fpmath_mode = fpmath_mode::strict; + bool b_is_vnni {false}; // Second level leading dimension describing distance between 16-line // blocks in case of blocked layout. Used to calculate address of next // bd block. By default are equal to regular leading dimension parameters @@ -189,7 +202,7 @@ struct DNNL_API brgemm_attr_t { }; struct brgemm_desc_t { - brgemm_desc_t() {} + brgemm_desc_t() = default; brgemm_desc_t(const brgemm_desc_t &other); DNNL_API ~brgemm_desc_t(); @@ -235,6 +248,7 @@ struct brgemm_desc_t { bool with_scales = false; bool skip_zp_b_compensation = false; bool skip_scales = false; + bool n_bcast_1_load = false; brgemm_broadcast_t zp_type_a = brgemm_broadcast_t::none; brgemm_broadcast_t zp_type_b = brgemm_broadcast_t::none; @@ -294,6 +308,7 @@ struct brgemm_desc_t { static constexpr int MAX_VPAD = 100; static constexpr int AMX_TILES_NUM = 8; + static constexpr int tilesize = 1024; void set_attr(const primitive_attr_t *ppdattr); void set_dst_md(const memory_desc_t *pdst_md); @@ -307,6 +322,22 @@ struct brgemm_desc_t { bool is_input_convert() const { return is_bf32 || is_fp8_via_convert(); } + bool with_wei_decomp = false; + bool with_grouped_wei_decomp = false; + bool with_wei_decomp_scales = false; + bool with_wei_decomp_zero_points = false; + int wei_decomp_scales_stride = 0; + int wei_decomp_zero_points_stride = 0; + int wei_decomp_scales_group_size = 0; + int wei_decomp_zero_points_group_size = 0; + impl::data_type_t wei_decomp_scales_dt = data_type::undef; + impl::data_type_t wei_decomp_zero_points_dt = data_type::undef; + bool with_src_dyn_quant = false; + int src_scales_group_size = 0; + int src_scales_stride = 0; + int src_sum_group_size = 0; + int src_grouped_sum_stride = 0; + bool is_row_major() const { assert(layout != brgemm_layout_undef); return layout == brgemm_row_major; @@ -364,43 +395,113 @@ struct brgemm_desc_t { return (get_num_C_tiles() + get_num_A_tiles() + N); } + int get_convert_wsp_buffer_size() const noexcept { + if (!is_input_convert()) return 0; + const int n_bdb = bd_block2; + const int n_rdb = rdb + (rdb_tail != 0); + const int n_ldb = ldb + (ldb_tail != 0); + const int downcvt_tiles = brgattr.max_bs * n_rdb * (n_bdb + n_ldb); + return downcvt_tiles * tilesize; + } + int get_wsp_buffer_size() const noexcept { int sz = 0; if (is_tmm) { - constexpr int tilesize = 1024; sz = get_num_C_tiles() * tilesize; // postops buffer - if (is_input_convert()) { - const int n_bdb = bd_block2; - const int n_rdb = rdb + (rdb_tail != 0); - const int n_ldb = ldb + (ldb_tail != 0); - const 
int downcvt_tiles - = brgattr.max_bs * n_rdb * (n_bdb + n_ldb); - sz += downcvt_tiles * tilesize; - } + sz += get_convert_wsp_buffer_size(); + if (amx_wary_k_tail()) sz += tilesize; } return sz; } + // A class version of the `static` version of the function. + // Note: used in benchdnn only, not used inside the library. + bool is_b_data_layout_vnni() const { + return is_b_data_layout_vnni(dt_a, dt_b, brgattr.b_is_vnni, isa_impl); + } + // This function indicates when VNNI granularity packing is expected by the // kernel. // - // Note: used in benchdnn only, not used inside the library. + // Note: used as the `static` function in ukernel only, not anywhere else. + // `static`-ness is required to identify if the transform routine must be + // used for the ukernel to work properly. This information is critical + // because the transform routine accepts only 4 `ldb` values which affects + // ukernel creation. Otherwise, the user must create the ukernel object, + // query the packing info, and if it's required, likely re-create the + // object with a different `ldb` value, which may not work because + // creation stage for user's application may not provide all the info to + // create a ukernel object. // Note: for `bf32` (or brgattr.fpmath_mode_ == bf16) the function returns // `true` because the data transformation to vnni layout is internal and // transparent to the user. - bool is_b_data_layout_vnni() const { + // Note: the library MUST NOT break the ability to provide this information + // without brgemm_desc_t object creation. + static bool is_b_data_layout_vnni(data_type_t dt_a, data_type_t dt_b, + bool attr_b_is_vnni, cpu_isa_t isa) { using namespace data_type; switch (dt_b) { case f32: return false; // Note: `dt_a == f32` means implicit up-conversion of B to f32. - case f16: return (isa_impl != avx512_core_fp16) && (dt_a != f32); + case f16: + return dt_a != f32 + && (is_f16_b_non_amx_vnni(dt_b, attr_b_is_vnni, isa) + || is_superset(isa, avx512_core_amx_fp16) + || is_superset(isa, avx2_vnni_2)); // Note: `dt_a == f32` means implicit up-conversion of B to f32. case bf16: return dt_a != f32; default: return true; } } + + // This function indicates when the kernel would operate with the D pointer + // (`true`) and when not (`false`). It's important to distinguish these two + // cases due to the fact that kernel would ignore D pointer completely if + // no post-accumulation work is identified. + // + // Correspondent decisions are done in `store_accumulators` function. + // The function is used inside kernel generation and ukernel API. + // TODO: extend usage to primitives (each of them utilize their own copy + // of this definition). + bool are_post_ops_applicable() const { + const bool has_zero_points = !utils::everyone_is( + brgemm_broadcast_t::none, zp_type_a, zp_type_b, zp_type_c); + return dt_c != dt_d || with_eltwise || with_binary || with_scales + || with_bias || with_sum || req_s8s8_compensation + || has_zero_points || with_dst_scales; + } + bool is_xf16() const noexcept { return is_bf16 || is_f16; } + bool is_f16_b_non_amx_vnni() const { + return is_f16_b_non_amx_vnni(dt_b, brgattr.b_is_vnni, isa_impl); + } + + // Note: `static` version appears because of `static is_b_data_layout_vnni`. + static bool is_f16_b_non_amx_vnni( + data_type_t dt_b, bool attr_b_is_vnni, cpu_isa_t isa) { + // This function controls the code section which relies on + // `avx512_core_fp16` instructions directly. 
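+        // (Sketch of the descriptor-free query pattern described above,
+        // with hypothetical caller-side names:
+        //     const bool packing_required
+        //             = brgemm_desc_t::is_b_data_layout_vnni(
+        //                     dt_a, dt_b, attr_b_is_vnni, isa);
+        // so a ukernel user can pick `ldb` before creating any object.)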
+ return dt_b == data_type::f16 && attr_b_is_vnni + && isa == avx512_core_fp16; + } + + bool reduce_by_words() const { + return is_bf16_tmm || is_f16_tmm || is_input_convert(); + } + int max_rd_block() const { return reduce_by_words() ? 32 : 64; } + int rd_block_step() const { return (reduce_by_words() && !is_fp8) ? 2 : 4; } + + bool amx_may_extend_k() const { + return (is_superset(isa_impl, avx512_core_amx) && brgattr.extendable_k + && (reduce_dim % data_type_vnni_granularity(dt_a) + || (reduce_dim > max_rd_block() + && reduce_dim % max_rd_block()))); + } + bool amx_wary_k_tail() const { + return amx_may_extend_k() && brgattr.wary_A_k_tail_read; + } + bool operator==(const brgemm_desc_t &rhs) const; bool operator<(const brgemm_desc_t &rhs) const; @@ -430,6 +531,12 @@ struct brgemm_dynamic_values_t { , dynamic_LDD(LDD) {} }; +struct brgemm_decomp_kernel_params_t { + const void *ptr_B; + const void *scratch_buf; + const void *bitmask_ptr; +}; + struct brgemm_kernel_params_t { const void *ptr_A; const void *ptr_B; @@ -464,10 +571,17 @@ struct brgemm_kernel_params_t { const void *a_zp_compensations = nullptr; const void *b_zp_compensations = nullptr; + const void *a_zp_values = nullptr; const void *c_zp_values = nullptr; size_t skip_accm = 0; int32_t zp_a_val = 1; const void *ptr_dst_scales = nullptr; + + const void *ptr_wei_scales = nullptr; + const void *ptr_wei_zero_points = nullptr; + const void *ptr_src_scales = nullptr; + const void *ptr_src_grouped_sum = nullptr; + size_t ic; dim_t dynamic_LDA = 0; dim_t dynamic_LDB = 0; dim_t dynamic_LDC = 0; @@ -479,24 +593,34 @@ struct jit_brgemm_kernel_t; struct jit_brgemm_amx_uker_base_t; template struct jit_brdgmm_kernel_base_t; -class jit_generator; +class jit_generator_t; struct brgemm_kernel_t { - brgemm_kernel_t() {}; - virtual ~brgemm_kernel_t() {}; + brgemm_kernel_t() = default; + virtual ~brgemm_kernel_t() = default; virtual status_t create_kernel() = 0; virtual void operator()(brgemm_kernel_params_t *) const = 0; - virtual const jit_generator *get_jit_generator() const = 0; + virtual const jit_generator_t *get_jit_generator() const = 0; + virtual const brgemm_desc_t &get_brg() const = 0; +}; + +struct jit_base_brgemm_kernel_t : public jit_generator_t { + jit_base_brgemm_kernel_t(const char *impl_name, cpu_isa_t isa_impl) + : jit_generator_t(impl_name, isa_impl) {} + virtual const brgemm_desc_t &get_brg() const = 0; }; template struct brgemm_kernel_common_t : public brgemm_kernel_t { brgemm_kernel_common_t(const brgemm_desc_t &abrd); - ~brgemm_kernel_common_t(); + ~brgemm_kernel_common_t() override; - status_t create_kernel(); - void operator()(brgemm_kernel_params_t *) const; - virtual const jit_generator *get_jit_generator() const; + status_t create_kernel() override; + void operator()(brgemm_kernel_params_t *) const override; + const jit_generator_t *get_jit_generator() const override; + const brgemm_desc_t &get_brg() const override { + return ((jit_base_brgemm_kernel_t *)brgemm_kernel_)->get_brg(); + } private: jit_brgemm_kernel_t *brgemm_kernel_ = nullptr; @@ -506,11 +630,14 @@ struct brgemm_kernel_common_t : public brgemm_kernel_t { struct brgemm_amx_uker_t : public brgemm_kernel_t { brgemm_amx_uker_t(const brgemm_desc_t &abrd); - ~brgemm_amx_uker_t(); + ~brgemm_amx_uker_t() override; - status_t create_kernel(); - void operator()(brgemm_kernel_params_t *) const; - virtual const jit_generator *get_jit_generator() const; + status_t create_kernel() override; + void operator()(brgemm_kernel_params_t *) const override; + const 
jit_generator_t *get_jit_generator() const override; + const brgemm_desc_t &get_brg() const override { + return ((jit_base_brgemm_kernel_t *)brgemm_kernel_)->get_brg(); + } private: jit_brgemm_amx_uker_base_t *brgemm_kernel_ = nullptr; @@ -521,11 +648,14 @@ struct brgemm_amx_uker_t : public brgemm_kernel_t { template struct brdgmm_kernel_t : public brgemm_kernel_t { brdgmm_kernel_t(const brgemm_desc_t &abrd); - ~brdgmm_kernel_t(); + ~brdgmm_kernel_t() override; - status_t create_kernel(); - void operator()(brgemm_kernel_params_t *) const; - virtual const jit_generator *get_jit_generator() const; + status_t create_kernel() override; + void operator()(brgemm_kernel_params_t *) const override; + const jit_generator_t *get_jit_generator() const override; + const brgemm_desc_t &get_brg() const override { + return ((jit_base_brgemm_kernel_t *)brgemm_kernel_)->get_brg(); + } private: jit_brdgmm_kernel_base_t *brgemm_kernel_ = nullptr; @@ -574,7 +704,8 @@ struct brgemm_post_ops_data_t { const void *b_zp_compensations = nullptr, const void *c_zp_values = nullptr, bool skip_accumulation = false, int32_t zp_a_val = 1, bool do_only_comp = false, - bool do_only_zp_a_val = false, const float *dst_scales = nullptr) + bool do_only_zp_a_val = false, const float *dst_scales = nullptr, + const void *a_zp_values = nullptr) : bias(bias) , scales(scales) , binary_post_ops_rhs(binary_post_ops_rhs) @@ -589,7 +720,8 @@ struct brgemm_post_ops_data_t { , zp_a_val {zp_a_val} , do_only_comp {do_only_comp} , do_only_zp_a_val {do_only_zp_a_val} - , dst_scales(dst_scales) {} + , dst_scales(dst_scales) + , a_zp_values(a_zp_values) {} const void *bias = nullptr; const float *scales = nullptr; @@ -606,6 +738,7 @@ struct brgemm_post_ops_data_t { const bool do_only_comp = false; const bool do_only_zp_a_val = false; const float *dst_scales = nullptr; + const void *a_zp_values = nullptr; }; } // namespace x64 diff --git a/src/cpu/x64/brgemm/brgemm_utils.cpp b/src/cpu/x64/brgemm/brgemm_utils.cpp index d62d908f101..48760f79b6c 100644 --- a/src/cpu/x64/brgemm/brgemm_utils.cpp +++ b/src/cpu/x64/brgemm/brgemm_utils.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2022-2024 Intel Corporation +* Copyright 2022-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -51,10 +51,14 @@ void init_kernel_datatype( brgemm_desc_t *brg, impl::data_type_t dt_a, impl::data_type_t dt_b) { assert(dt_a != data_type::undef && dt_b != data_type::undef); brg->is_int8 = utils::one_of(dt_a, data_type::u8, data_type::s8) - && utils::one_of(dt_b, data_type::u8, data_type::s8); - brg->is_bf16 = (dt_a == data_type::bf16) && (dt_b == data_type::bf16); - brg->is_f32 = (dt_a == data_type::f32) && (dt_b == data_type::f32); - brg->is_f16 = utils::one_of(data_type::f16, dt_a, dt_b); + && utils::one_of(dt_b, data_type::u8, data_type::s8, data_type::u4); + brg->is_bf16 = (dt_a == data_type::bf16) && utils::one_of(dt_b, data_type::bf16, data_type::u8, data_type::s8, data_type::nf4, data_type::s4, data_type::u4, data_type::f4_e2m1); + // Note: f32:bf16 is treated as f32 case while f32:f16 has already been + // treated as f16. Probably, need a common ground here. 
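+    // Illustrative examples of the resulting classification (derived from
+    // the surrounding assignments; not an exhaustive table):
+    //     dt_a = f32,  dt_b = bf16 -> is_f32  (B is up-converted)
+    //     dt_a = bf16, dt_b = bf16 -> is_bf16
+    //     dt_a = bf16, dt_b = u8   -> is_bf16 (weights decompression)
+    //     dt_a = f16,  dt_b = f16  -> is_f16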
+ brg->is_f32 = (dt_a == data_type::f32) + && utils::one_of( + dt_b, data_type::f32, data_type::f16, data_type::bf16, data_type::u8, data_type::s8, data_type::nf4, data_type::s4, data_type::u4, data_type::f4_e2m1); + brg->is_f16 = utils::one_of(data_type::f16, dt_a, dt_b) && !brg->is_f32; brg->is_fp8 = one_of(dt_a, data_type::f8_e5m2, data_type::f8_e4m3) && one_of(dt_b, data_type::f8_e5m2, data_type::f8_e4m3); assert(brg->is_int8 || brg->is_bf16 || brg->is_f32 || brg->is_f16 @@ -131,24 +135,45 @@ void set_isa_impl(brgemm_desc_t *brg) { is_isa_ok(avx512_core_fp16), avx512_core_fp16, is_isa_ok(avx2), avx2); } else if (brg->is_bf16) { - brg->isa_impl = utils::map(true, isa_undef, is_isa_ok(avx512_core_amx), - avx512_core_amx, is_isa_ok(avx512_core_bf16), avx512_core_bf16, - is_isa_ok(avx2_vnni_2), avx2_vnni_2); + if (brg->dt_a == data_type::f32 && brg->dt_b == data_type::bf16) { + // Distinguish f32:bf16 case upconversion for bf16 on AVX512_CORE + // and AVX2. + brg->isa_impl = utils::map(true, isa_undef, + is_isa_ok(avx512_core_amx), avx512_core_amx, + is_isa_ok(avx512_core_bf16), avx512_core_bf16, + is_isa_ok(avx512_core), avx512_core, is_isa_ok(avx2_vnni_2), + avx2_vnni_2, is_isa_ok(avx2), avx2); + } else { + brg->isa_impl = utils::map(true, isa_undef, + is_isa_ok(avx512_core_amx), avx512_core_amx, + is_isa_ok(avx512_core_bf16), avx512_core_bf16, + is_isa_ok(avx2_vnni_2), avx2_vnni_2); + } } else if (brg->is_f16) { if (everyone_is(data_type::f16, brg->dt_a, brg->dt_b)) { brg->isa_impl = utils::map(true, isa_undef, is_isa_ok(avx512_core_amx_fp16), avx512_core_amx_fp16, is_isa_ok(avx512_core_fp16), avx512_core_fp16, is_isa_ok(avx2_vnni_2), avx2_vnni_2); + } else if (brg->dt_a == data_type::f32 && brg->dt_b == data_type::f16) { + // Distinguish f32:f16 case upconversion for f16 on AVX512_CORE and + // AVX2. + brg->isa_impl = utils::map(true, isa_undef, + is_isa_ok(avx512_core_fp16), avx512_core_fp16, + is_isa_ok(avx512_core), avx512_core, is_isa_ok(avx2), avx2); } else { brg->isa_impl = utils::map(true, isa_undef, is_isa_ok(avx512_core_fp16), avx512_core_fp16); } } else if (brg->is_int8) { - brg->isa_impl = utils::map(true, isa_undef, is_isa_ok(avx512_core_amx), - avx512_core_amx, is_isa_ok(avx512_core_vnni), avx512_core_vnni, - is_isa_ok(avx512_core), avx512_core, is_isa_ok(avx2_vnni_2), - avx2_vnni_2, is_isa_ok(avx2_vnni), avx2_vnni); + brg->isa_impl + = utils::map(true, isa_undef, is_isa_ok(avx512_core_amx_fp16), + avx512_core_amx_fp16, is_isa_ok(avx512_core_amx), + avx512_core_amx, is_isa_ok(avx512_core_fp16), + avx512_core_fp16, is_isa_ok(avx512_core_vnni), + avx512_core_vnni, is_isa_ok(avx512_core), avx512_core, + is_isa_ok(avx2_vnni_2), avx2_vnni_2, + is_isa_ok(avx2_vnni), avx2_vnni, is_isa_ok(avx2), avx2); } else if (brg->is_fp8) { brg->isa_impl = utils::map(true, isa_undef, is_isa_ok(avx10_1_512_amx_fp16), avx10_1_512_amx_fp16); @@ -190,17 +215,19 @@ int calculate_max_bcast_block(brgemm_desc_t *brg, const int adj_ld_block2) { || brg->brgattr.max_bottom_vpad > 0) && brg->zp_type_a != brgemm_broadcast_t::none; const int beta_regs = !one_of(brg->beta, 1.f, 0.f); + const int b_vnni_regs = brg->is_f16_b_non_amx_vnni() ? 
2 : 0; const int max_isa_regs = isa_num_vregs(brg->isa_impl); // note: the 'adj_ld_block2' already removes the necessary registers // for 'embd_bcst' auto max_reg_count = max_isa_regs - max_bcst_regs - beta_regs - - req_compensation - req_zp_a_comp_pads; + - req_compensation - req_zp_a_comp_pads - b_vnni_regs; if (req_zp_a_comp_pads) max_reg_count = nstl::min(max_reg_count, max_isa_regs - max_bcst_regs - 5); - const int postops_regs = brg->attr() + // For dynamic quantization case it is more performant to maximize the amount of accumulators + const int postops_regs = brg->attr() && !brg->with_src_dyn_quant ? injector::aux_vec_count( brg->attr()->post_ops_, brg->isa_impl, true) : 0; @@ -218,238 +245,175 @@ int calculate_max_bcast_block(brgemm_desc_t *brg, const int adj_ld_block2) { // non-VNNI INT8 dot product required 2 temp vectors if (brg->is_int8 && !brg->has_int8_vnni) max_bcast_block -= 2; + if (one_of(brg->dt_b, data_type::nf4) && brg->isa_impl == avx2) max_bcast_block -= 5; + if (one_of(brg->dt_b, data_type::f4_e2m1) && brg->isa_impl == avx2) max_bcast_block -= 2; + if (one_of(brg->dt_b, data_type::nf4, data_type::f4_e2m1) && brg->isa_impl != avx2) max_bcast_block -= 1; + if (brg->with_wei_decomp_zero_points && brg->wei_decomp_zero_points_stride == 0 && !brg->with_src_dyn_quant) max_bcast_block -= 1; + if (brg->with_src_dyn_quant) max_bcast_block -= 1; + max_bcast_block /= adj_ld_block2; return max_bcast_block; } -status_t brgemm_blocking(brgemm_desc_t *brg) { - - set_isa_impl(brg); - if (brg->isa_impl == isa_undef) return status::unimplemented; - assert(!brg->is_dgmm); // should not be called from brdgmm - if (brg->is_dgmm) return status::unimplemented; - set_brg_vmm(brg); - if (!(brg->is_tmm || brg->is_zmm || brg->is_ymm)) - return status::unimplemented; - - if (!brg->is_tmm) { - const int simd_w = is_superset(brg->isa_impl, avx512_core) ? 
16 : 8; - brg->ld_block = simd_w; - brg->ldb = brg->load_dim / brg->ld_block; - brg->ldb_tail = brg->load_dim % brg->ld_block; - - int adj_ld_block2 = calculate_ldb_params(brg, 4); - int max_bcast_block = calculate_max_bcast_block(brg, adj_ld_block2); - - // reduce 'ld_block2' to allow a larger 'bd_block' - const int max_vpad = nstl::max( - brg->brgattr.max_top_vpad, brg->brgattr.max_bottom_vpad); - if (is_superset(brg->isa_impl, avx2) && max_bcast_block < max_vpad) { - adj_ld_block2 = calculate_ldb_params(brg, 2); - max_bcast_block = calculate_max_bcast_block(brg, adj_ld_block2); +status_t brgemm_blocking_tmm(brgemm_desc_t *brg) { + const auto L1 = platform::get_per_core_cache_size(1); + + // Blocking configuration for AMX + const auto BD = brg->bcast_dim; + const auto BD_R16 = rnd_up(BD, 16); + const auto LD = brg->load_dim; + const auto LD_R16 = rnd_up(LD, 16); + + const int max_width = 16, min_width = 1; + brg->ld_block = 16; + brg->ldb = LD / brg->ld_block; + brg->ldb_tail = LD % brg->ld_block; + + auto find_bdb_bd_mask = [&](int bd_block, int &bdb, int &bdb_tail) { + if (brg->brgattr.bd_mask_level != 2 || BD == 0) { + bdb = div_up(BD, bd_block); + bdb_tail = BD % bd_block; + return; } - const int min_block = 1; - float best_bd_block_eff = 0.f; - brg->bd_block = 1; - for (int bd_block = max_bcast_block; bd_block >= min_block; - bd_block--) { - const auto bd_block_disb = static_cast(brg->bcast_dim) - / rnd_up(brg->bcast_dim, bd_block); - const auto brgemm_microkernel_eff - = (static_cast(adj_ld_block2) * bd_block) - / (((adj_ld_block2) + bd_block) * max_bcast_block); - const auto bd_block_eff = bd_block_disb * brgemm_microkernel_eff; - - float block_foot_print = static_cast(brg->typesize_A) - * (bd_block * brg->reduce_dim); - if (block_foot_print <= static_cast( - platform::get_per_core_cache_size(1)) - && (bd_block_eff > best_bd_block_eff)) { - brg->bd_block = bd_block; - best_bd_block_eff = bd_block_eff; + bdb = 0; + bdb_tail = 0; + for (int i = 0; i < BD;) { + if (brg->brgattr.bd_mask_level == 2 + && brg->brgattr.bd_mask[i] == 0) { + i++; + } else { + i += bd_block; + if (i > BD) { + bdb_tail = BD - i + bd_block; + if (brg->brgattr.use_uker) bdb++; + } else + bdb++; } } - brg->bdb = brg->bcast_dim / brg->bd_block; - brg->bdb_tail = brg->bcast_dim % brg->bd_block; - - const int rd_unroll = 4; - const data_type_t rd_block_dt = get_mac_emu_data_type( - brg->dt_a, brg->isa_impl, brg->isa_impl != avx2_vnni_2); - if (rd_block_dt == dnnl_data_type_undef) return status::unimplemented; - const int vnni_granularity = data_type_vnni_granularity(rd_block_dt); - brg->rd_block = rd_unroll * vnni_granularity; - brg->rdb = brg->reduce_dim / brg->rd_block; - brg->rdb_tail = brg->reduce_dim % brg->rd_block; + }; - brg->is_M_tail = false; - } else { - // Blocking configuration for AMX - const int max_width = 16, min_width = 1; - brg->ld_block = 16; - brg->ldb = brg->load_dim / brg->ld_block; - brg->ldb_tail = brg->load_dim % brg->ld_block; - - auto find_bdb_bd_mask = [&](int bd_block, int &bdb, int &bdb_tail) { - if (brg->brgattr.bd_mask_level != 2 || brg->bcast_dim == 0) { - bdb = div_up(brg->bcast_dim, bd_block); - bdb_tail = brg->bcast_dim % bd_block; - return; + auto find_bd_block_for_bd_mask = [&]() { + if (brg->brgattr.bd_mask_level != 2 || BD == 0) return false; + + auto min_bdb = INT_MAX; + const auto start_bd_block = nstl::min(max_width, BD); + auto best_bd_block = start_bd_block; + for (auto bd_block = start_bd_block; bd_block > 0; bd_block--) { + int bdb = 0; + int bdb_tail = 0; + 
find_bdb_bd_mask(bd_block, bdb, bdb_tail); + // bcast_dim should be divided by bd_block + if (bdb < min_bdb && bdb_tail == 0) { + min_bdb = bdb; + best_bd_block = bd_block; } + } + brg->bd_block = best_bd_block; + brg->bdb_tail = 0; + brg->bdb = min_bdb; + return true; + }; - bdb = 0; - bdb_tail = 0; - for (int i = 0; i < brg->bcast_dim;) { - if (brg->brgattr.bd_mask_level == 2 - && brg->brgattr.bd_mask[i] == 0) { - i++; - } else { - i += bd_block; - if (i > brg->bcast_dim) { - bdb_tail = brg->bcast_dim - i + bd_block; - if (brg->brgattr.use_uker) bdb++; - } else - bdb++; - } - } - }; + auto set_decomposition_by_ld = [&]() { + if (brg->bd_block2 == 1 && brg->ldb > 0 && brg->ldb_tail == 0) { + if (brg->ldb % 3 == 0) + brg->ld_block2 = 3; + else if (brg->ldb % 2 == 0) + brg->ld_block2 = 2; + else + brg->ld_block2 = 1; + } else { + brg->ld_block2 + = (brg->ldb > 0 && brg->ldb % 2 == 0 && brg->ldb_tail == 0 + && brg->bd_block2 < 3) + ? 2 + : 1; + } + brg->ldb2 = brg->ldb / brg->ld_block2; + brg->ldb2_tail = brg->ldb % brg->ld_block2; - auto find_bd_block_for_bd_mask = [&]() { - if (brg->brgattr.bd_mask_level != 2 || brg->bcast_dim == 0) - return false; - - auto min_bdb = INT_MAX; - const auto start_bd_block = nstl::min(max_width, brg->bcast_dim); - auto best_bd_block = start_bd_block; - for (auto bd_block = start_bd_block; bd_block > 0; bd_block--) { - int bdb = 0; - int bdb_tail = 0; - find_bdb_bd_mask(bd_block, bdb, bdb_tail); - // bcast_dim should be divided by bd_block - if (bdb < min_bdb && bdb_tail == 0) { - min_bdb = bdb; - best_bd_block = bd_block; - } - } - brg->bd_block = best_bd_block; - brg->bdb_tail = 0; - brg->bdb = min_bdb; - return true; - }; + // Re-adjust the bd_block2 if possible + if (brg->ld_block2 == 1 && !brg->is_M_tail && brg->ldb_tail == 0) { + brg->bd_block2 = (brg->bdb >= 3) ? 3 : (brg->bdb >= 2) ? 2 : 1; + brg->bdb2 = brg->bdb / brg->bd_block2; + brg->bdb2_tail = (brg->bd_block2 == 1) ? brg->bdb + : brg->bdb % brg->bd_block2; + } + }; - auto set_decomposition_by_ld = [&]() { - if (brg->bd_block2 == 1 && brg->ldb > 0 && brg->ldb_tail == 0) { - if (brg->ldb % 3 == 0) - brg->ld_block2 = 3; - else if (brg->ldb % 2 == 0) - brg->ld_block2 = 2; - else - brg->ld_block2 = 1; - } else { - brg->ld_block2 - = (brg->ldb > 0 && brg->ldb % 2 == 0 - && brg->ldb_tail == 0 && brg->bd_block2 < 3) - ? 2 - : 1; - } - brg->ldb2 = brg->ldb / brg->ld_block2; - brg->ldb2_tail = brg->ldb % brg->ld_block2; - - // Re-adjust the bd_block2 if possible - if (brg->ld_block2 == 1 && !brg->is_M_tail && brg->ldb_tail == 0) { - brg->bd_block2 = (brg->bdb >= 3) ? 3 : (brg->bdb >= 2) ? 2 : 1; - brg->bdb2 = brg->bdb / brg->bd_block2; - brg->bdb2_tail = (brg->bd_block2 == 1) - ? 
brg->bdb - : brg->bdb % brg->bd_block2; + auto try_3x1_decomposition = [&](int width_step) { + brg->is_M_tail = false; + if (BD > (width_step - 1) * max_width && BD < width_step * max_width + && brg->ldb_tail == 0) { + if (!find_bd_block_for_bd_mask()) { + brg->bd_block = max_width; + brg->bdb = div_up(BD, brg->bd_block); + brg->bdb_tail = BD % brg->bd_block; + brg->is_M_tail = true; } - }; + brg->bd_block2 = width_step; + brg->bdb2 = brg->bdb / brg->bd_block2; + brg->bdb2_tail = brg->bdb % brg->bd_block2; + set_decomposition_by_ld(); + return true; + } + return false; + }; - auto try_3x1_decomposition = [&](int width_step) { - brg->is_M_tail = false; - if (brg->bcast_dim > (width_step - 1) * max_width - && brg->bcast_dim < width_step * max_width - && brg->ldb_tail == 0) { - if (!find_bd_block_for_bd_mask()) { - brg->bd_block = max_width; - brg->bdb = div_up(brg->bcast_dim, brg->bd_block); - brg->bdb_tail = brg->bcast_dim % brg->bd_block; - brg->is_M_tail = true; + auto try_2x2_decomposition = [&]() { + if (!find_bd_block_for_bd_mask()) { + for (int m_block = max_width; m_block >= min_width; m_block--) { + if (BD % m_block == 0) { + brg->bd_block = m_block; + break; } - brg->bd_block2 = width_step; - brg->bdb2 = brg->bdb / brg->bd_block2; - brg->bdb2_tail = brg->bdb % brg->bd_block2; - set_decomposition_by_ld(); - return true; } - return false; - }; - - auto try_2x2_decomposition = [&]() { - if (!find_bd_block_for_bd_mask()) { - for (int m_block = max_width; m_block >= min_width; m_block--) { - if (brg->bcast_dim % m_block == 0) { - brg->bd_block = m_block; - break; + if (brg->bd_block == 1) { + brg->bd_block = nstl::min(max_width, BD); + brg->bdb_tail = BD % max_width; + for (int i = max_width; i >= min_width; i--) { + const auto i_tail = BD % i; + if (i_tail > brg->bdb_tail || i_tail == 0) { + brg->bd_block = i; + brg->bdb_tail = i_tail; + if (i_tail == 0) break; } } - if (brg->bd_block == 1) { - brg->bd_block = nstl::min(max_width, brg->bcast_dim); - brg->bdb_tail = brg->bcast_dim % max_width; - for (int i = max_width; i >= min_width; i--) { - const auto i_tail = brg->bcast_dim % i; - if (i_tail > brg->bdb_tail || i_tail == 0) { - brg->bd_block = i; - brg->bdb_tail = i_tail; - if (i_tail == 0) break; - } - } - } - brg->bdb = brg->bcast_dim / brg->bd_block; - brg->bdb_tail = brg->bcast_dim % brg->bd_block; } + brg->bdb = BD / brg->bd_block; + brg->bdb_tail = BD % brg->bd_block; + } - brg->bd_block2 = (brg->bdb >= 2) ? 2 : 1; - brg->bdb2 = brg->bdb / brg->bd_block2; - brg->bdb2_tail = (brg->bd_block2 == 1) ? brg->bdb - : brg->bdb % brg->bd_block2; - - brg->is_M_tail = false; + brg->bd_block2 = (brg->bdb >= 2) ? 2 : 1; + brg->bdb2 = brg->bdb / brg->bd_block2; + brg->bdb2_tail + = (brg->bd_block2 == 1) ? 
brg->bdb : brg->bdb % brg->bd_block2; - set_decomposition_by_ld(); + brg->is_M_tail = false; - return !(brg->ld_block2 == 1 || brg->bd_block2 == 1 - || brg->bd_block < 8); - }; + set_decomposition_by_ld(); - bool is_decomposition_defined = false; - for (int i = decomposition_2x2; i != undefined; i++) { - switch (i) { - case decomposition_2x2: - is_decomposition_defined = try_2x2_decomposition(); - break; - case decomposition_3x1_3: - is_decomposition_defined = try_3x1_decomposition(3); - break; - case decomposition_3x1_2: - is_decomposition_defined = try_3x1_decomposition(2); - break; - default: assert(!"invalid value"); break; - }; - if (is_decomposition_defined) break; - } - if (!is_decomposition_defined) try_2x2_decomposition(); + return !(brg->ld_block2 == 1 || brg->bd_block2 == 1 + || brg->bd_block < 8); + }; - auto recalc_bd_block = [&](int new_bd_block) { - if (new_bd_block == 0) return; + auto recalc_blocking = [&](int new_bd_block, int new_ld_block, + int new_bd_block2, int new_ld_block2) { + if (new_bd_block != 0) { brg->bd_block = new_bd_block; find_bdb_bd_mask(brg->bd_block, brg->bdb, brg->bdb_tail); brg->is_M_tail = (brg->bdb_tail != 0); - }; + } - auto recalc_bd_block2 = [&](int new_bd_block2) { - if (new_bd_block2 == 0) return; + if (new_ld_block != 0) { + brg->ld_block = new_ld_block; + brg->ldb = div_up(LD, brg->ld_block); + brg->ldb_tail = LD % brg->ld_block; + } + + if (new_bd_block2 != 0) { brg->bd_block2 = new_bd_block2; if (can_dispatch_uker(brg)) { brg->bdb2 = div_up(brg->bdb, brg->bd_block2); @@ -460,17 +424,9 @@ status_t brgemm_blocking(brgemm_desc_t *brg) { brg->bdb2 = full_bd_blocks / brg->bd_block2; brg->bdb2_tail = full_bd_blocks % brg->bd_block2; } - }; - - auto recalc_ld_block = [&](int new_ld_block) { - if (new_ld_block == 0) return; - brg->ld_block = new_ld_block; - brg->ldb = div_up(brg->load_dim, brg->ld_block); - brg->ldb_tail = brg->load_dim % brg->ld_block; - }; + } - auto recalc_ld_block2 = [&](int new_ld_block2) { - if (new_ld_block2 == 0) return; + if (new_ld_block2 != 0) { brg->ld_block2 = new_ld_block2; if (can_dispatch_uker(brg)) { brg->ldb2 = div_up(brg->ldb, brg->ld_block2); @@ -481,217 +437,184 @@ status_t brgemm_blocking(brgemm_desc_t *brg) { brg->ldb2 = full_ld_blocks / brg->ld_block2; brg->ldb2_tail = full_ld_blocks % brg->ld_block2; } - }; + } + }; - const bool try_load_nt_A - = (brg->innermost_loop == brgemm_bd_loop_innermost); - const bool try_load_nt_B - = (brg->innermost_loop == brgemm_ld_loop_innermost); - const bool try_load_nt - = (static_cast(brg->typesize_A) - * brg->brgattr.hint_expected_A_size - + static_cast(brg->typesize_B) - * brg->brgattr.hint_expected_B_size - + static_cast(brg->typesize_C) - * brg->brgattr.hint_expected_C_size) - >= platform::get_per_core_cache_size(1); - brg->load_nt_A = try_load_nt_A && try_load_nt; - brg->load_nt_B = try_load_nt_B && try_load_nt; - - recalc_bd_block(brg->bd_block); - recalc_bd_block2(brg->bd_block2); - recalc_ld_block(brg->ld_block); - recalc_ld_block2(brg->ld_block2); - - if (can_dispatch_uker(brg)) { - // Blocking heuristics for some shapes - // TODO: Review these criterias - size_t eff_K - = brg->reduce_dim * brg->typesize_A * brg->brgattr.K_koef; - auto L1 = platform::get_per_core_cache_size(1); - auto low_K = (L1 - 4 * 1024) / (6 * 16); - - // TODO: if rdb_tail != 0 then we should limit - // blocking because we need extra tiles for A and B to load rdb_tail - // if bd_mask_level != 0 it means it aligned to 16 - - bool bdb_block_tail = !(brg->bd_block > 12 - && (brg->bcast_dim % 
brg->bd_block == 0 - && brg->brgattr.bd_mask_level == 0)); - bool ldb_tail_16 = (brg->load_dim % 16 != 0); - if (everyone_is(false, bdb_block_tail, ldb_tail_16)) { - // try to use 1x(4|5) or (4|5)x1 decomposition for specific - // range of K - auto upper_K5 = (L1 - 5 * 1024) / (5 * 16); - auto upper_K4 = (L1 - 4 * 1024) / (4 * 16); - bool K5_fit_L1 = (low_K <= eff_K && eff_K < upper_K5); - bool K4_fit_L1 = (low_K <= eff_K && eff_K < upper_K4); - bool bd_big = (brg->bcast_dim > 32); - bool ld_big = (brg->load_dim > 32); - if (brg->load_dim % 80 == 0 && K5_fit_L1 && bd_big) { - - recalc_ld_block(16); - recalc_bd_block2(1); - recalc_ld_block2(5); - brg->load_nt_A = true; - brg->load_nt_B = false; - brg->innermost_loop = brgemm_bd_loop_innermost; - } else if (brg->load_dim % 64 == 0 && K4_fit_L1 && bd_big) { - - recalc_ld_block(16); - recalc_bd_block2(1); - recalc_ld_block2(4); - brg->load_nt_A = true; - brg->load_nt_B = false; - brg->innermost_loop = brgemm_bd_loop_innermost; - } else if ((brg->bcast_dim % 80 == 0 - || (brg->brgattr.bd_mask_level != 0 - && brg->bdb % 4 == 0)) - && K5_fit_L1 && ld_big) { - - recalc_ld_block(16); - recalc_bd_block2(5); - recalc_ld_block2(1); - brg->load_nt_A = false; - brg->load_nt_B = true; - brg->innermost_loop = brgemm_ld_loop_innermost; - } else if ((brg->bcast_dim % 64 == 0 - || (brg->brgattr.bd_mask_level != 0 - && brg->bdb % 4 == 0)) - && K4_fit_L1 && ld_big) { - - recalc_bd_block(16); - recalc_ld_block(16); - recalc_bd_block2(4); - recalc_ld_block2(1); - brg->load_nt_A = false; - brg->load_nt_B = true; - brg->innermost_loop = brgemm_ld_loop_innermost; - } - } - // Tile decomposition for shapes with small dimensions - // or dimensions with tails - if (ldb_tail_16 && !bdb_block_tail && brg->load_dim > 64 - && brg->ld_block < 8) { - recalc_ld_block(16); - recalc_bd_block2(2); - recalc_ld_block2(1); - } else if (ldb_tail_16 && !bdb_block_tail - && rnd_up(brg->load_dim, 16) == 64 - && (brg->ld_block < 8 || brg->ldb_tail > 0)) { - recalc_ld_block(16); - recalc_bd_block2(1); - recalc_ld_block2(4); - } else if (ldb_tail_16 && !bdb_block_tail - && rnd_up(brg->load_dim, 16) == 48 - && (brg->ld_block < 8 || brg->ldb_tail > 0)) { - recalc_ld_block(16); - recalc_bd_block2(1); - recalc_ld_block2(3); - } else if (ldb_tail_16 && !bdb_block_tail - && rnd_up(brg->load_dim, 16) == 32 - && (brg->ld_block < 8 || brg->ldb_tail > 0)) { - recalc_ld_block(16); - recalc_bd_block2(2); - recalc_ld_block2(2); - } else if (brg->bcast_dim <= 16) { - recalc_bd_block(brg->bcast_dim); - recalc_ld_block(16); - recalc_bd_block2(1); - recalc_ld_block2( - nstl::min(ldb_tail_16 ? ((brg->ldb > 4) ? 
3 : 4) : 5, - div_up(brg->load_dim, 16))); - } else if (bdb_block_tail && !ldb_tail_16 && brg->bcast_dim > 64 - && (brg->bd_block < 8 || brg->bdb_tail > 0)) { - - recalc_bd_block(16); - recalc_ld_block(16); - recalc_bd_block2(1); - recalc_ld_block2(2); - } else if (bdb_block_tail && !ldb_tail_16 - && rnd_up(brg->bcast_dim, 16) == 64 - && (brg->bd_block < 8 || brg->bdb_tail > 0)) { - recalc_bd_block(16); - recalc_ld_block(16); - recalc_bd_block2(4); - recalc_ld_block2(1); - } else if (bdb_block_tail && !ldb_tail_16 - && rnd_up(brg->bcast_dim, 16) == 48 - && (brg->bd_block < 8 || brg->bdb_tail > 0)) { - recalc_bd_block(16); - recalc_ld_block(16); - recalc_bd_block2(3); - recalc_ld_block2(1); - } else if (bdb_block_tail && !ldb_tail_16 - && rnd_up(brg->bcast_dim, 16) == 32 - && (brg->bd_block < 8 || brg->bdb_tail > 0) - && (brg->load_dim % 32 == 0)) { - - recalc_bd_block(16); - recalc_ld_block(16); - recalc_bd_block2(2); - recalc_ld_block2(2); - } else if (brg->load_dim <= 16) { - recalc_bd_block(16); - recalc_ld_block(16); // we can't use ld_block other than 16 - recalc_bd_block2( - nstl::min(brg->bdb_tail ? (brg->bdb > 4 ? 3 : 4) : 5, - div_up(brg->bcast_dim, 16))); - recalc_ld_block2(1); - } else if (bdb_block_tail && ldb_tail_16 - && rnd_up(brg->bcast_dim, 16) == 32 - && rnd_up(brg->load_dim, 16) == 32 - && (brg->ld_block < 8 || brg->ldb_tail > 0 - || brg->bd_block < 8 || brg->bdb_tail > 0)) { - recalc_bd_block(16); - recalc_ld_block(16); - recalc_bd_block2(2); - recalc_ld_block2(2); - } - // if interleave stores and small number of iterations then - // try to increase them - auto n_iterations = brg->bdb2 * brg->bdb2; - if (false && brg->brgattr.use_interleave_stores - && n_iterations < 4) { - int k_it = div_up(4, n_iterations); - if (brg->bdb2 > brg->ldb2) - recalc_bd_block2(div_up(brg->bdb2, k_it)); - else - recalc_ld_block2(div_up(brg->ldb2, k_it)); + auto recalc_blocking_ext + = [&](int new_bd_block, int new_ld_block, int new_bd_block2, + int new_ld_block2, bool load_nt_A, bool load_nt_B, + brgemm_kernel_innermost_loop_t innermost_loop) { + recalc_blocking(new_bd_block, new_ld_block, new_bd_block2, + new_ld_block2); + brg->load_nt_A = load_nt_A; + brg->load_nt_B = load_nt_B; + brg->innermost_loop = innermost_loop; + }; + + bool is_decomposition_defined = false; + for (int i = decomposition_2x2; i != undefined; i++) { + switch (i) { + case decomposition_2x2: + is_decomposition_defined = try_2x2_decomposition(); + break; + case decomposition_3x1_3: + is_decomposition_defined = try_3x1_decomposition(3); + break; + case decomposition_3x1_2: + is_decomposition_defined = try_3x1_decomposition(2); + break; + default: assert(!"invalid value"); break; + }; + if (is_decomposition_defined) break; + } + if (!is_decomposition_defined) try_2x2_decomposition(); + + const bool try_load_nt_A + = (brg->innermost_loop == brgemm_bd_loop_innermost); + const bool try_load_nt_B + = (brg->innermost_loop == brgemm_ld_loop_innermost); + const bool try_load_nt + = (static_cast(brg->typesize_A) + * brg->brgattr.hint_expected_A_size + + static_cast(brg->typesize_B) + * brg->brgattr.hint_expected_B_size + + static_cast(brg->typesize_C) + * brg->brgattr.hint_expected_C_size) + >= L1; + brg->load_nt_A = try_load_nt_A && try_load_nt; + brg->load_nt_B = try_load_nt_B && try_load_nt; + + recalc_blocking( + brg->bd_block, brg->ld_block, brg->bd_block2, brg->ld_block2); + + if (can_dispatch_uker(brg)) { + // Blocking heuristics for some shapes + // TODO: Review these criteria + const size_t eff_K + = brg->reduce_dim * 
brg->typesize_A * brg->brgattr.K_koef; + const auto low_K = (L1 - 4 * 1024) / (6 * 16); + + // TODO: if rdb_tail != 0 then we should limit + // blocking because we need extra tiles for A and B to load rdb_tail + // if bd_mask_level != 0 it means it aligned to 16 + + const bool bdb_block_tail = !(brg->bd_block > 12 + && (BD % brg->bd_block == 0 + && brg->brgattr.bd_mask_level == 0)); + const bool ldb_tail_16 = (LD % 16 != 0); + if (everyone_is(false, bdb_block_tail, ldb_tail_16)) { + // try to use 1x(4|5) or (4|5)x1 decomposition for specific + // range of K + const auto upper_K5 = (L1 - 5 * 1024) / (5 * 16); + const auto upper_K4 = (L1 - 4 * 1024) / (4 * 16); + const bool K5_fit_L1 = (low_K <= eff_K && eff_K < upper_K5); + const bool K4_fit_L1 = (low_K <= eff_K && eff_K < upper_K4); + const bool bd_big = (BD > 32); + const bool ld_big = (LD > 32); + const bool aligned_bd_mask + = brg->brgattr.bd_mask_level != 0 && brg->bdb % 4 == 0; + if (LD % 80 == 0 && K5_fit_L1 && bd_big) { + recalc_blocking_ext( + 0, 16, 1, 5, true, false, brgemm_bd_loop_innermost); + } else if (LD % 64 == 0 && K4_fit_L1 && bd_big) { + recalc_blocking_ext( + 0, 16, 1, 4, true, false, brgemm_bd_loop_innermost); + } else if ((BD % 80 == 0 || aligned_bd_mask) && K5_fit_L1 + && ld_big) { + + recalc_blocking_ext( + 0, 16, 5, 1, false, true, brgemm_ld_loop_innermost); + } else if ((BD % 64 == 0 || aligned_bd_mask) && K4_fit_L1 + && ld_big) { + recalc_blocking_ext( + 16, 16, 4, 1, false, true, brgemm_ld_loop_innermost); } } + // Tile decomposition for shapes with small dimensions + // or dimensions with tails + const bool weak_ldb = brg->ld_block < 8 || brg->ldb_tail > 0; + const bool weak_bdb = brg->bd_block < 8 || brg->bdb_tail > 0; + const bool ldb_tail_only = ldb_tail_16 && !bdb_block_tail; + const bool bdb_tail_only = bdb_block_tail && !ldb_tail_16; + if (ldb_tail_only && LD > 64 && brg->ld_block < 8) { + recalc_blocking(0, 16, 2, 1); + } else if (ldb_tail_only && weak_ldb && LD_R16 == 64) { + recalc_blocking(0, 16, 1, 4); + } else if (ldb_tail_only && weak_ldb && LD_R16 == 48) { + recalc_blocking(0, 16, 1, 3); + } else if (ldb_tail_only && weak_ldb && LD_R16 == 32) { + recalc_blocking(0, 16, 2, 2); + } else if (BD <= 16) { + // Have to call recalc_blocking twice to calculate ldb + recalc_blocking(BD, 16, 0, 0); + const auto ld_block2 = nstl::min( + ldb_tail_16 ? ((brg->ldb > 4) ? 3 : 4) : 5, div_up(LD, 16)); + recalc_blocking(0, 0, 1, ld_block2); + } else if (bdb_tail_only && weak_bdb && BD > 64) { + recalc_blocking(16, 16, 1, 2); + } else if (bdb_tail_only && weak_bdb && BD_R16 == 64) { + recalc_blocking(16, 16, 4, 1); + } else if (bdb_tail_only && weak_bdb && BD_R16 == 48) { + recalc_blocking(16, 16, 3, 1); + } else if (bdb_tail_only && weak_bdb && BD_R16 == 32 + && (LD % 32 == 0)) { + recalc_blocking(16, 16, 2, 2); + } else if (LD <= 16) { + // Have to call recalc_blocking twice to calculate bdb + // we can't use ld_block other than 16 + recalc_blocking(16, 16, 0, 0); + const auto bd_block2 = nstl::min( + brg->bdb_tail ? (brg->bdb > 4 ? 
3 : 4) : 5, div_up(BD, 16)); + recalc_blocking(0, 0, bd_block2, 1); + } else if (bdb_block_tail && ldb_tail_16 && BD_R16 == 32 && LD_R16 == 32 + && (weak_ldb || weak_bdb)) { + recalc_blocking(16, 16, 2, 2); + } - if (brg->get_num_A_tiles() + brg->get_num_B_tiles() - + brg->get_num_C_tiles() - > brgemm_desc_t::AMX_TILES_NUM) { - assert(!"brgemm internal error: invalid blocking"); - return status::runtime_error; + // The code below is a draft for the future optimization of interleave + // stores and small number of iterations. + // TODO: review and enable if needed +#if 0 + // if interleave stores and small number of iterations then + // try to increase them + const auto n_iterations = brg->bdb2 * brg->bdb2; + if (brg->brgattr.use_interleave_stores && n_iterations < 4) { + int k_it = div_up(4, n_iterations); + if (brg->bdb2 > brg->ldb2) + recalc_blocking(0, 0, div_up(brg->bdb2, k_it), 0); + else + recalc_blocking(0, 0, 0, div_up(brg->ldb2, k_it)); } +#endif + } - // check hints for blocking parameters - recalc_bd_block(brg->brgattr.hint_bd_block); - recalc_bd_block2(brg->brgattr.hint_bd_block2 - ? brg->brgattr.hint_bd_block2 - : brg->bd_block2); - recalc_ld_block(brg->brgattr.hint_ld_block); - recalc_ld_block2(brg->brgattr.hint_ld_block2 - ? brg->brgattr.hint_ld_block2 - : brg->ld_block2); - - if (brg->brgattr.hint_load_nt_A != brgemm_hint_nt_undef) - brg->load_nt_A - = (brg->brgattr.hint_load_nt_A == brgemm_hint_nt_true); - if (brg->brgattr.hint_load_nt_B != brgemm_hint_nt_undef) - brg->load_nt_B - = (brg->brgattr.hint_load_nt_B == brgemm_hint_nt_true); - - const bool reduce_by_words = brg->is_bf16_tmm || brg->is_f16_tmm - || brg->is_input_convert(); - const auto max_rd_block = reduce_by_words ? 32 : 64; - const auto rd_block_step = (reduce_by_words && !brg->is_fp8) ? 2 : 4; - // TODO: if rd_block calculated is very small then maybe it makes - // sense to use 1x2 or 2x1 blocking with supporting rd_block - // and rdb_tail + if (brg->get_num_A_tiles() + brg->get_num_B_tiles() + brg->get_num_C_tiles() + > brgemm_desc_t::AMX_TILES_NUM) { + assert(!"brgemm internal error: invalid blocking"); + return status::runtime_error; + } + + // check hints for blocking parameters + recalc_blocking(brg->brgattr.hint_bd_block, brg->brgattr.hint_ld_block, + brg->brgattr.hint_bd_block2 ? brg->brgattr.hint_bd_block2 + : brg->bd_block2, + brg->brgattr.hint_ld_block2 ? 
+
+ // check hints for blocking parameters
+ recalc_blocking(brg->brgattr.hint_bd_block, brg->brgattr.hint_ld_block,
+ brg->brgattr.hint_bd_block2 ? brg->brgattr.hint_bd_block2
+ : brg->bd_block2,
+ brg->brgattr.hint_ld_block2 ? brg->brgattr.hint_ld_block2
+ : brg->ld_block2);
+
+ if (brg->brgattr.hint_load_nt_A != brgemm_hint_nt_undef)
+ brg->load_nt_A = (brg->brgattr.hint_load_nt_A == brgemm_hint_nt_true);
+ if (brg->brgattr.hint_load_nt_B != brgemm_hint_nt_undef)
+ brg->load_nt_B = (brg->brgattr.hint_load_nt_B == brgemm_hint_nt_true);
+
+ // TODO: if the calculated rd_block is very small, it may make sense
+ // to use 1x2 or 2x1 blocking with support for rd_block and rdb_tail
+ const auto rd_block_step = brg->rd_block_step();
+ const auto max_rd_block = brg->max_rd_block();
+ if (brg->amx_may_extend_k()) {
+ brg->rd_block = nstl::min(
+ rnd_up(brg->reduce_dim, brg->rd_step), max_rd_block);
+ } else {
 brg->rd_block = rd_block_step;
 for (int i = max_rd_block; i > 0; i -= rd_block_step) {
 if (brg->reduce_dim % i == 0) {
@@ -699,33 +622,186 @@ status_t brgemm_blocking(brgemm_desc_t *brg) {
 break;
 }
 }
- brg->rdb = brg->reduce_dim / brg->rd_block;
- brg->rdb_tail = brg->reduce_dim % brg->rd_block;
-
- // Remove these guards in the future (add tail processing by reduction
- // dimension)
- // TODO: these checks do not work for fp8-f16 and f16-fp8 cfgs
- if (!IMPLICATION(
- brg->rdb > 0 && brg->rdb_tail, brg->is_input_convert())) {
- return status::unimplemented;
+ }
+
+ brg->rdb = brg->reduce_dim / brg->rd_block;
+ brg->rdb_tail = brg->reduce_dim % brg->rd_block;
+
+ // Remove these guards in the future (add tail processing by reduction
+ // dimension)
+ // TODO: these checks do not work for fp8-f16 and f16-fp8 cfgs
+ if (!IMPLICATION(brg->rdb > 0 && brg->rdb_tail,
+ brg->is_input_convert() || brg->amx_wary_k_tail())) {
+ return status::unimplemented;
+ }
+
+ if (!IMPLICATION((brg->rdb_tail
+ % ((brg->is_bf16_tmm || brg->is_f16_tmm) ? 2 : 4))
+ != 0,
+ brg->is_input_convert() || brg->amx_wary_k_tail())) {
+ return status::unimplemented;
+ }
+
+ // TODO: check this condition
+ brg->interleave_tilestores_ = brg->beta == 0
+ && brg->brgattr.use_interleave_stores
+ && (brg->bd_block2 * brg->ld_block2 == 4)
+ && !brg->brgattr.var_bs;
+ return status::success;
+}
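// [Editor's note] Not part of the patch: the rd_block search a few lines
// up (split across the two hunks), restated standalone. It picks the
// largest rd_block <= max_rd_block that is a multiple of rd_block_step
// and divides reduce_dim exactly, so no reduction tail is left.
inline int pick_rd_block(int reduce_dim, int max_rd_block, int step) {
    for (int i = max_rd_block; i > 0; i -= step)
        if (reduce_dim % i == 0) return i;
    return step; // fallback: a tail will remain
}
// E.g. pick_rd_block(96, 64, 4) == 48, and pick_rd_block(62, 64, 2) == 62.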
+
+status_t brgemm_blocking_vmm(brgemm_desc_t *brg) {
+ const auto L1 = platform::get_per_core_cache_size(1);
+
+ const int simd_w = is_superset(brg->isa_impl, avx512_core) ? 16 : 8;
+ brg->ld_block = simd_w;
+ brg->ldb = brg->load_dim / brg->ld_block;
+ brg->ldb_tail = brg->load_dim % brg->ld_block;
+
+ const int max_vpad = nstl::max(
+ brg->brgattr.max_top_vpad, brg->brgattr.max_bottom_vpad);
+
+ int max_bcast_block {0}, min_bcast_block {0}, adj_ld_block2 {0};
+ if (brg->with_src_dyn_quant) {
+ adj_ld_block2 = calculate_ldb_params(brg, 4);
+ max_bcast_block = calculate_max_bcast_block(brg, adj_ld_block2);
+ // reduce 'ld_block2' to allow a larger 'bd_block'
+ if (is_superset(brg->isa_impl, avx2) && max_bcast_block < max_vpad) {
+ for (int try_ld_block2 = 2; try_ld_block2 > 0; --try_ld_block2) {
+ adj_ld_block2 = calculate_ldb_params(brg, try_ld_block2);
+ max_bcast_block = calculate_max_bcast_block(brg, adj_ld_block2);
+ if (max_bcast_block >= max_vpad) break;
+ }
+ // the bcast block in the brgemm kernel should be at least as
+ // large as the virtual padding to avoid functional issues
+ if (max_bcast_block < max_vpad) return status::unimplemented;
+ }
+ } else {
+ // iterate ld_block2 starting from 4 to allow bd_block larger than
+ // virtual padding
+ bool few_regs = utils::one_of(brg->isa_impl, avx2, avx2_vnni, avx2_vnni_2);
+ bool hint_n_bcast_1_load
+ = brg->brgattr.hint_loop_order == brgemm_lo_bl_1load;
+ for (int try_ld_block2 = 4; try_ld_block2 > 0; --try_ld_block2) {
+ adj_ld_block2 = calculate_ldb_params(brg, try_ld_block2);
+ brg->n_bcast_1_load
+ = (few_regs && adj_ld_block2 == 4) || hint_n_bcast_1_load;
+ max_bcast_block = calculate_max_bcast_block(brg, adj_ld_block2);
+ const auto bdb_tail = brg->bcast_dim % max_bcast_block;
+ min_bcast_block = bdb_tail > 0 ? bdb_tail : max_bcast_block;
+ if (min_bcast_block >= max_vpad) break;
 }
- if (!IMPLICATION(
- (brg->rdb_tail
- % ((brg->is_bf16_tmm || brg->is_f16_tmm) ? 2 : 4))
- != 0,
- brg->is_input_convert())) {
- return status::unimplemented;
+ // the bcast block in the brgemm kernel should be at least as
+ // large as the virtual padding to avoid functional issues
+ if (min_bcast_block < max_vpad) return status::unimplemented;
+ }
+
+ const int min_block = nstl::max(1, max_vpad);
+
+ float best_bd_block_eff = 0.f;
+ if (max_bcast_block == 0) max_bcast_block = 1;
+ brg->bd_block = max_bcast_block;
+ for (int bd_block = max_bcast_block; bd_block >= min_block; bd_block--) {
+ const auto bd_block_disb = static_cast<float>(brg->bcast_dim)
+ / rnd_up(brg->bcast_dim, bd_block);
+ const auto brgemm_microkernel_eff
+ = (static_cast<float>(adj_ld_block2) * bd_block)
+ / (((adj_ld_block2) + bd_block) * max_bcast_block);
+ const auto bd_block_eff = bd_block_disb * brgemm_microkernel_eff;
+
+ float block_foot_print = static_cast<float>(brg->typesize_A)
+ * (bd_block * brg->reduce_dim);
+ if (block_foot_print <= static_cast<float>(L1)
+ && (bd_block_eff > best_bd_block_eff)) {
+ brg->bd_block = bd_block;
+ best_bd_block_eff = bd_block_eff;
 }
+ }
+ brg->bdb = brg->bcast_dim / brg->bd_block;
+ brg->bdb_tail = brg->bcast_dim % brg->bd_block;
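// [Editor's note] Not part of the patch: the score maximized by the
// bd_block loop above, written out. The first factor penalizes the
// padding that rnd_up() introduces; the second is the FMA-to-load ratio
// of an (ld_block2 x bd_block) microkernel, normalized by the largest
// candidate block.
inline float bd_block_score(
        int bcast_dim, int bd_block, int ld_block2, int max_bcast_block) {
    const int padded = (bcast_dim + bd_block - 1) / bd_block * bd_block;
    const float disb = static_cast<float>(bcast_dim) / padded;
    const float mk_eff = static_cast<float>(ld_block2) * bd_block
            / ((ld_block2 + bd_block) * max_bcast_block);
    return disb * mk_eff;
}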
+
+ const data_type_t rd_block_dt = get_mac_emu_data_type(
+ brg->dt_a, brg->isa_impl, brg->isa_impl != avx2_vnni_2);
+ if (rd_block_dt == dnnl_data_type_undef) return status::unimplemented;
+ const int vnni_granularity
+ = (brg->is_f16 && brg->isa_impl == avx512_core_fp16)
+ ? 1
+ : data_type_vnni_granularity(brg->dt_a);
+ int rd_unroll = one_of(brg->dt_b, data_type::nf4, data_type::u4,
+ data_type::s4, data_type::f4_e2m1) ? 32 : 4;
+ if (brg->with_grouped_wei_decomp && !brg->with_src_dyn_quant) {
+ auto min_group_size = nstl::min(brg->wei_decomp_scales_group_size,
+ brg->wei_decomp_zero_points_group_size);
+ min_group_size = nstl::min(min_group_size, brg->src_scales_group_size);
+ rd_unroll = nstl::min(rd_unroll, min_group_size / vnni_granularity);
+ brg->rd_block = rd_unroll * vnni_granularity;
+ } else if (brg->with_src_dyn_quant) {
+ brg->rd_block = brg->src_scales_group_size;
+ auto min_group_size = nstl::min(brg->wei_decomp_scales_group_size,
+ brg->wei_decomp_zero_points_group_size);
+ brg->rd_block = nstl::min(brg->rd_block, min_group_size);
+ } else {
+ brg->rd_block = rd_unroll * vnni_granularity;
+ }
+
+ brg->rdb = brg->reduce_dim / brg->rd_block;
+ brg->rdb_tail = brg->reduce_dim % brg->rd_block;
+
+ brg->is_M_tail = false;
+ // avx2_vnni_2 kernel with xf16 data type requires blocked weights.
+ if (brg->isa_impl == avx2_vnni_2 && brg->is_xf16()
+ && brg->LDB % brg->ld_block > 0)
+ return status::unimplemented;
+
+ return status::success;
+}
-
- //TODO: check this condition
- brg->interleave_tilestores_ = brg->beta == 0
- && (brg->brgattr.use_interleave_stores
- && (brg->bd_block2 * brg->ld_block2 == 4)
- && !brg->brgattr.var_bs)
- ? true
- : false;
+status_t brgemm_blocking(brgemm_desc_t *brg) {
+ const bool is_b_in_vnni_format
+ = !(brg->dt_b == data_type::f16 && brg->isa_impl == avx512_core_fp16)
+ && !(one_of(brg->dt_a, data_type::f32, data_type::bf16)
+ && one_of(brg->dt_b, data_type::u8, data_type::s8))
+ && !(one_of(brg->dt_a, data_type::f32)
+ && one_of(brg->dt_b, data_type::bf16, data_type::f16));
+ brg->ld_step
+ = is_b_in_vnni_format ? data_type_vnni_granularity(brg->dt_b) : 1;
+ const bool has_no_vnni_compute_instruction
+ = (brg->is_f16 && one_of(brg->isa_impl, avx2_vnni_2, avx512_core_fp16))
+ || (brg->is_bf16 && brg->isa_impl == avx2_vnni_2)
+ || (one_of(brg->dt_a, data_type::f32, data_type::bf16)
+ && one_of(brg->dt_b, data_type::u8, data_type::s8,
+ data_type::nf4, data_type::s4, data_type::u4,
+ data_type::f4_e2m1))
+ || (one_of(brg->dt_a, data_type::f32)
+ && one_of(brg->dt_b, data_type::bf16, data_type::f16));
+ brg->rd_step = has_no_vnni_compute_instruction
+ ? 1
+ : data_type_vnni_granularity(brg->dt_b);
+
+ if (brg->with_src_dyn_quant && one_of(brg->dt_b, data_type::u4)) {
+ brg->ld_step = 8;
+ brg->rd_step = 4;
 }
+ set_isa_impl(brg);
+ if (brg->isa_impl == isa_undef) return status::unimplemented;
+ assert(!brg->is_dgmm); // should not be called from brdgmm
+ if (brg->is_dgmm) return status::unimplemented;
+ set_brg_vmm(brg);
+ if (!(brg->is_tmm || brg->is_zmm || brg->is_ymm))
+ return status::unimplemented;
+
+ if (brg->is_tmm)
+ CHECK(brgemm_blocking_tmm(brg));
+ else
+ CHECK(brgemm_blocking_vmm(brg));
+
+ if (!IMPLICATION(brg->brgattr.LDB2 == 0, brg->load_dim <= brg->LDB))
+ return status::invalid_arguments;
+
+ brg->LDA2 = (brg->brgattr.LDA2 != 0) ? brg->brgattr.LDA2 : brg->LDA;
+ brg->LDB2 = (brg->brgattr.LDB2 != 0) ? brg->brgattr.LDB2 : brg->LDB;
+ brg->LDC2_M = (brg->brgattr.LDC2_M != 0) ? brg->brgattr.LDC2_M : brg->LDC;
+ brg->LDC2_N
+ = (brg->brgattr.LDC2_N != 0) ? brg->brgattr.LDC2_N : brg->ld_block;
+
+ brg->is_blocked = (brg->LDA2 != brg->LDA || brg->LDB2 != brg->LDB
+ || brg->LDC2_M != brg->LDC || brg->LDC2_N != brg->ld_block);
+
+ if (!IMPLICATION(brg->is_blocked, brg->layout == brgemm_row_major))
+ return status::invalid_arguments;
+ return status::success;
 }
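// [Editor's note] Not part of the patch: what the new LDA2/LDB2/LDC2
// fields express. A "blocked" matrix stores ld_block-wide column panels
// contiguously, so an element is addressed in two levels; the names here
// are illustrative, mirroring the C_offset() change further down.
inline size_t blocked_c_offset(size_t row, size_t col, size_t ldc,
        size_t ldc2_m, size_t ldc2_n, size_t typesize) {
    const size_t panel = col / ldc; // which column panel
    const size_t in_panel = col % ldc; // offset inside the panel
    return row * ldc2_m + panel * ldc2_n + in_panel * typesize;
}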
@@ -839,7 +915,8 @@ void init_brgemm_conf(brgemm_desc_t *brg, cpu_isa_t isa,
 brg->isa_user = isa;
 set_isa_impl(brg);
- brg->is_int8_tmm = brg->is_int8 && brg->isa_impl == avx512_core_amx;
+ brg->is_int8_tmm
+ = brg->is_int8 && is_superset(brg->isa_impl, avx512_core_amx);
 brg->is_bf16_tmm = brg->is_bf16 && brg->isa_impl == avx512_core_amx;
 brg->is_f16_tmm = brg->is_f16 && brg->isa_impl == avx512_core_amx_fp16;
 brg->is_bf32 = is_bf32
@@ -851,8 +928,9 @@ void init_brgemm_conf(brgemm_desc_t *brg, cpu_isa_t isa,
 brg->has_int8_vnni = isa_has_int8_vnni(brg->isa_impl);
 set_brg_vmm(brg); // TODO: Investigate if it is really needed here.
- brg->req_s8s8_compensation = brg->is_int8 && brg->dt_a == data_type::s8
- && !isa_has_s8s8(brg->isa_impl);
+ brg->req_s8s8_compensation = brg->is_int8 && !brg->is_int8_tmm
+ && !isa_has_s8s8(brg->isa_impl) && brg->dt_a == data_type::s8
+ && !brg->with_src_dyn_quant;
 brg->LDA = (brg->is_row_major()) ? static_cast<int>(LDA)
 : static_cast<int>(LDB);
@@ -875,15 +953,6 @@ void init_brgemm_conf(brgemm_desc_t *brg, cpu_isa_t isa,
 brg->bd_block2 = 0;
 brg->bdb2 = 0;
 brg->bdb2_tail = 0;
-
- const data_type_t ld_step_compute_dt
- = get_mac_emu_data_type(brg->dt_b, brg->isa_impl,
- brg->isa_impl != avx2_vnni_2 && !brg->is_fp8_via_convert());
- brg->ld_step = data_type_vnni_granularity(ld_step_compute_dt);
-
- const data_type_t rd_step_compute_dt = get_mac_emu_data_type(
- brg->dt_b, brg->isa_impl, !brg->is_fp8_via_convert());
- brg->rd_step = data_type_vnni_granularity(rd_step_compute_dt);
 }
 void init_brdgmm_conf(brgemm_desc_t *brg, cpu_isa_t isa,
diff --git a/src/cpu/x64/brgemm/brgemm_utils.hpp b/src/cpu/x64/brgemm/brgemm_utils.hpp
index db2fc9a2a8d..2ff100d2351 100644
--- a/src/cpu/x64/brgemm/brgemm_utils.hpp
+++ b/src/cpu/x64/brgemm/brgemm_utils.hpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2022-2024 Intel Corporation
+* Copyright 2022-2025 Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -28,12 +28,17 @@ namespace impl {
 namespace cpu {
 namespace x64 {
+void init_kernel_datatype(
+ brgemm_desc_t *brg, data_type_t dt_a, data_type_t dt_b);
+
 namespace brgemm_utils {
 bool can_dispatch_uker(const brgemm_desc_t *brg);
 void maybe_try_bf32(brgemm_desc_t *brg);
+void set_isa_impl(brgemm_desc_t *brg);
+
 status_t brgemm_blocking(brgemm_desc_t *brg);
 status_t brdgmm_blocking(brgemm_desc_t *brg);
diff --git a/src/cpu/x64/brgemm/capi/brgemm_api.cpp b/src/cpu/x64/brgemm/capi/brgemm_api.cpp
deleted file mode 100644
index 79d5dcc73b8..00000000000
--- a/src/cpu/x64/brgemm/capi/brgemm_api.cpp
+++ /dev/null
@@ -1,688 +0,0 @@
-/*******************************************************************************
-* Copyright 2024 Intel Corporation
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -#include "oneapi/dnnl/dnnl_ukernel.h" - -#include "common/c_types_map.hpp" -#include "common/memory_desc_wrapper.hpp" -#include "common/verbose.hpp" - -#include "cpu/ref_io_helper.hpp" - -#include "cpu/x64/amx_tile_configure.hpp" - -#include "cpu/x64/brgemm/brgemm.hpp" - -#include "cpu/x64/brgemm/capi/brgemm_api.hpp" - -#ifdef DNNL_EXPERIMENTAL_UKERNEL - -using namespace dnnl::impl; -using namespace dnnl::impl::format_tag; -using namespace dnnl::impl::status; -using namespace dnnl::impl::cpu::x64; - -using brgemm_t = dnnl_brgemm; -using transform_t = dnnl_transform; - -#define VCHECK_BRGEMM(cond, msg, ...) \ - VCONDCHECK(ukernel, create, check, brgemm, (cond), \ - status::invalid_arguments, msg, ##__VA_ARGS__) - -#define VCHECK_BRGEMM_STATUS(status, cond, msg, ...) \ - VCONDCHECK(ukernel, create, check, brgemm, (cond), (status), msg, \ - ##__VA_ARGS__) - -status_t attr_params_t::set_post_ops_args(const void **post_ops_args) { - post_ops_args_ = post_ops_args; - return status::success; -} - -status_t attr_params_t::set_scales(const void *scales, int arg) { - switch (arg) { - case DNNL_ARG_SRC: a_scales_ = scales; break; - case DNNL_ARG_WEIGHTS: b_scales_ = scales; break; - case DNNL_ARG_DST: d_scales_ = scales; break; - default: assert(!"unsupported arg"); - } - return status::success; -} - -const void *attr_params_t::get_scales(int arg) const { - switch (arg) { - case DNNL_ARG_SRC: return a_scales_; - case DNNL_ARG_WEIGHTS: return b_scales_; - case DNNL_ARG_DST: return d_scales_; - default: assert(!"unsupported arg"); - } - return nullptr; -} - -dnnl_brgemm::~dnnl_brgemm() { - brgemm_kernel_destroy(brgemm_kernel_); -} - -// Typical usage is either `1.f` to append to previous result, or `0.f` to write -// C from scratch. 
-status_t brgemm_t::set_add_C(int add_C) { - if (add_C == 0) - beta_ = 0.f; - else if (add_C == 1) - beta_ = 1.f; - return status::success; -} - -status_t brgemm_t::set_post_ops( - dim_t ldd, data_type_t d_dt, const post_ops_t *post_ops) { - ldd_ = ldd; - d_dt_ = d_dt; - CHECK(attr_.set_post_ops(*post_ops)); - return status::success; -} - -status_t brgemm_t::set_scales(int mask, int arg) { - if (mask < 0) return status::invalid_arguments; - CHECK(attr_.scales_.set(arg, mask)); - return status::success; -} - -status_t brgemm_t::finalize() { - brgemm_batch_kind_t batch_kind = brgemm_batch_kind_t::brgemm_offs; - - auto status = brgemm_desc_init(&brgemm_desc_, cpu_isa_t::isa_undef, - batch_kind, a_dt_, b_dt_, /* transA = */ false, - /* trans_B = */ false, brgemm_row_major, /* alpha = */ 1.f, beta_, - lda_, ldb_, ldc_, M_, N_, K_, - /* strides = */ nullptr); - if (status != status::success) { - VCHECK_BRGEMM_STATUS(status, false, "brgemm_desc_init failed"); - } - - memory_desc_t D_md; - dims_t dims {M_, N_}; - dims_t strides {ldc_, 1}; - status = memory_desc_init_by_strides( - D_md, /* ndims = */ 2, dims, d_dt_, strides); - if (status != status::success) { - VCHECK_BRGEMM_STATUS(status, false, "D_md creation failed"); - } - - status = brgemm_desc_set_postops( - &brgemm_desc_, &attr_, &D_md, ldd_, data_type::undef); - if (status != status::success) { - VCHECK_BRGEMM_STATUS(status, false, "brgemm_desc_set_postops failed"); - } - - brgemm_attr_t brgemm_attr; - brgemm_attr.max_bs = batch_size_; - if (mayiuse(avx512_core_amx)) { - brgemm_attr.use_uker = true; - brgemm_attr.use_interleave_stores = true; - brgemm_attr.hint_prefetching = brgemm_kernel_prefetching_t::brgemm_prf0; - } - - status = brgemm_desc_set_attr(&brgemm_desc_, brgemm_attr); - if (status != status::success) { - VCHECK_BRGEMM_STATUS(status, false, "brgemm_desc_set_attr failed"); - } - - // Note: API can't take a compensation buffer externally. Users must add - // compensation on their own as a binary post-op. - brgemm_desc_.req_s8s8_compensation = false; - - return status::success; -} - -pack_type_t brgemm_t::get_B_pack_type() const { - if (brgemm_desc_.is_b_data_layout_vnni()) return pack_type::pack32; - return pack_type::no_trans; -} - -size_t brgemm_t::get_scratchpad_size() const { - return brgemm_desc_.get_wsp_buffer_size(); -} - -status_t brgemm_t::set_hw_context() const { - char palette[AMX_PALETTE_SIZE] = {}; - auto status = brgemm_init_tiles(brgemm_desc_, palette); - // If status isn't successful, it means tiles configuration is not required. - if (status == status::success) { - status = amx_tile_lazy_configure(palette); - VCHECK_BRGEMM_STATUS( - status, status == status::success, "amx_tile_configure failed"); - } - return status::success; -} - -status_t brgemm_t::generate() { - // Re-generation won't take any effect. - if (brgemm_kernel_ != nullptr) return status::success; - - auto status = brgemm_kernel_create(&brgemm_kernel_, brgemm_desc_); - VCHECK_BRGEMM_STATUS( - status, status == status::success, "brgemm_kernel_create failed"); - - // Generate a verbose info string at the point where configuration is done. 
- if (get_verbose(verbose_t::exec_profile, component_t::ukernel)) { - create_verbose_info(); - } - return status::success; -} - -status_t brgemm_t::execute(const void *A_ptr, const void *B_ptr, - const dim_t *A_B_offsets, void *C_ptr, void *scratchpad_ptr) const { - const auto batch_size = brgemm_desc_.brgattr.max_bs; - std::vector v_batch_element(batch_size); - for (int i = 0; i < batch_size; i++) { - v_batch_element[i].offset.A = A_B_offsets[2 * i]; - v_batch_element[i].offset.B = A_B_offsets[2 * i + 1]; - } - - if (get_verbose(verbose_t::exec_profile, component_t::ukernel)) { - double start_ms = get_msec(); - brgemm_kernel_execute(brgemm_kernel_, batch_size, A_ptr, B_ptr, - v_batch_element.data(), C_ptr, scratchpad_ptr, - /* dynamic_values = */ nullptr); - double duration_ms = get_msec() - start_ms; - - std::stringstream ss; - ss << "cpu,brgemm,,undef," << verbose_info_; - VPROF(start_ms, ukernel, exec, VERBOSE_profile, ss.str().c_str(), - duration_ms); - } else { - brgemm_kernel_execute(brgemm_kernel_, batch_size, A_ptr, B_ptr, - v_batch_element.data(), C_ptr, scratchpad_ptr, - /* dynamic_values = */ nullptr); - } - return status::success; -} - -status_t brgemm_t::execute(const void *A_ptr, const void *B_ptr, - const dim_t *A_B_offsets, const void *C_ptr, void *D_ptr, - void *scratchpad_ptr, const attr_params_t *attr_params) const { - if (attr_params == nullptr) return status::invalid_arguments; - - const auto batch_size = brgemm_desc_.brgattr.max_bs; - std::vector v_batch_element(batch_size); - for (int i = 0; i < batch_size; i++) { - v_batch_element[i].offset.A = A_B_offsets[2 * i]; - v_batch_element[i].offset.B = A_B_offsets[2 * i + 1]; - } - - brgemm_post_ops_data_t post_ops_data; - // Note: this member is used to compute an offset from the base DST address. - // Thus, it's not a C buffer that should be passed, but D buffer. - post_ops_data.data_C_ptr_ = reinterpret_cast(D_ptr); - // This member expects a pointer to a vector of pointers to binary_po args. - // It's exactly what `attr_params` stores when gets a pointer from the user. - post_ops_data.binary_post_ops_rhs = attr_params->get_post_ops_args(); - - // Scales (quantization case, happens after accumulation). Require manual - // combining when both are present, and extending to full simd broadcast, - // when single values are provided. - // Note: this piece is pretty close to what `precompute_scales` does. - // TODO: switch to `precompute_scales` directly. - alignas(64) float scales_buf[16] = {0}; - // TODO: delegate extra memory to scratchpad? - std::vector wei_scales_v(N_); - - const bool has_src_scales - = !attr_.scales_.get(DNNL_ARG_SRC).has_default_values(); - const bool has_wei_scales - = !attr_.scales_.get(DNNL_ARG_WEIGHTS).has_default_values(); - - // Save src scale value to re-use it. - float src_scale_val = 1.f; - if (has_src_scales) { - const void *src_scales_ptr = attr_params->get_scales(DNNL_ARG_SRC); - if (src_scales_ptr == nullptr) return status::invalid_arguments; - - src_scale_val - = cpu::io::load_float_value(data_type::f32, src_scales_ptr, 0); - } - if (has_wei_scales) { - // Handle weights entirely here to avoid duplicating the logic. 
- - const void *wei_scales_ptr = attr_params->get_scales(DNNL_ARG_WEIGHTS); - if (wei_scales_ptr == nullptr) return status::invalid_arguments; - - int wei_mask = attr_.scales_.get(DNNL_ARG_WEIGHTS).mask_; - if (wei_mask > 0) { - for (dim_t i = 0; i < N_; i++) { - const float wei_scale_val = cpu::io::load_float_value( - data_type::f32, wei_scales_ptr, i); - wei_scales_v[i] = wei_scale_val * src_scale_val; - } - post_ops_data.scales = wei_scales_v.data(); - } else { - const float s = cpu::io::load_float_value( - data_type::f32, wei_scales_ptr, 0); - utils::array_set(scales_buf, s * src_scale_val, 16); - post_ops_data.scales = scales_buf; - } - } else if (has_src_scales) { - utils::array_set(scales_buf, src_scale_val, 16); - post_ops_data.scales = scales_buf; - } - - // Destination scales. Require manual extending to full simd broadcast. - alignas(64) float dst_scales_buf[16] = {0}; - if (!attr_.scales_.get(DNNL_ARG_DST).has_default_values()) { - const void *dst_scales_ptr = attr_params->get_scales(DNNL_ARG_DST); - if (dst_scales_ptr == nullptr) return status::invalid_arguments; - - const float s - = cpu::io::load_float_value(data_type::f32, dst_scales_ptr, 0); - utils::array_set(dst_scales_buf, 1.f / s, 16); - post_ops_data.dst_scales = dst_scales_buf; - } - - if (D_ptr && c_dt_ == d_dt_ - && attr_.has_default_values( - primitive_attr_t::skip_mask_t::fpmath_mode)) { - C_ptr = D_ptr; - } - - if (get_verbose(verbose_t::exec_profile, component_t::ukernel)) { - double start_ms = get_msec(); - brgemm_kernel_execute_postops(brgemm_kernel_, batch_size, A_ptr, B_ptr, - v_batch_element.data(), const_cast(C_ptr), D_ptr, - post_ops_data, scratchpad_ptr, - /* dynamic_values = */ nullptr); - double duration_ms = get_msec() - start_ms; - - std::stringstream ss; - ss << "cpu,brgemm,,undef," << verbose_info_; - VPROF(start_ms, ukernel, exec, VERBOSE_profile, ss.str().c_str(), - duration_ms); - } else { - brgemm_kernel_execute_postops(brgemm_kernel_, batch_size, A_ptr, B_ptr, - v_batch_element.data(), const_cast(C_ptr), D_ptr, - post_ops_data, scratchpad_ptr, - /* dynamic_values = */ nullptr); - } - return status::success; -} - -status_t brgemm_t::create_verbose_info() { -#if defined(DISABLE_VERBOSE) - return status::success; -#else - const auto &d = brgemm_desc_; - std::stringstream ss; - - memory_desc_t src_md; - const dims_t src_dims = {M_, K_}; - const dims_t src_strides = {lda_, 1}; - CHECK(memory_desc_init_by_strides(src_md, 2, src_dims, a_dt_, src_strides)); - - memory_desc_t wei_md; - const dims_t wei_dims = {K_, N_}; - const dims_t wei_strides = {ldb_, 1}; - CHECK(memory_desc_init_by_strides(wei_md, 2, wei_dims, b_dt_, wei_strides)); - - memory_desc_t dst_md; - const dims_t dst_dims = {M_, N_}; - const dims_t dst_strides = {ldd_, 1}; - CHECK(memory_desc_init_by_strides(dst_md, 2, dst_dims, d_dt_, dst_strides)); - - ss << md2fmt_str("src", &src_md, format_kind::undef) << " "; - ss << md2fmt_str("wei", &wei_md, format_kind::undef) << " "; - ss << md2fmt_str("dst", &dst_md, format_kind::undef); - ss << "," << attr2str(&attr_) << ","; - ss << "bs:" << d.brgattr.max_bs << " beta:" << beta_; - ss << "," << md2dim_str(&src_md) << ":" << md2dim_str(&wei_md); - - verbose_info_ = ss.str(); - return status::success; -#endif -} - -dnnl_transform::dnnl_transform(dim_t K, dim_t N, pack_type_t in_pack_type, - dim_t in_ld, dim_t out_ld, data_type_t in_dt, data_type_t out_dt) - : K_(K) - , N_(N) - , in_ld_(in_ld) - , out_ld_(out_ld) - , in_dt_(in_dt) - , out_dt_(out_dt) { - // Check for a valid in_ld depending on a 
pack type. - assert(in_pack_type == pack_type::no_trans ? in_ld_ >= N_ : in_ld_ >= K_); - // Only special N_blk sizes are supported by matmul copy routines. Rest - // will crash. - assert(utils::one_of(out_ld_, 16, 32, 48, 64)); - - const auto in_tag = in_pack_type == pack_type::trans ? format_tag::ba - : format_tag::ab; - auto status = matmul::init_conf(bmc_, /* batch = */ 1, K_, N_, in_ld_, - out_ld_, in_dt_, out_dt_, in_tag); - assert(status == status::success); - if (status != status::success) return; - - if (in_pack_type == pack_type::trans) { - strides_[0] = 1; - strides_[1] = in_ld_; - } else if (in_pack_type == pack_type::no_trans) { - strides_[0] = in_ld_; - strides_[1] = 1; - } else { - assert(!"Unsupported pack type"); - } -} - -status_t transform_t::generate() { - // Re-generation won't take any effect. - if (pack_B_kernel_ != nullptr) return status::success; - - CHECK(matmul::create_brgemm_matmul_copy_b(pack_B_kernel_, &bmc_)); - - // Generate a verbose info string at the point where configuration is done. - if (get_verbose(verbose_t::exec_profile, component_t::ukernel)) { - CHECK(create_verbose_info()); - } - return status::success; -} - -status_t transform_t::execute(const void *src, void *dst) const { - double start_ms = 0; - if (get_verbose(verbose_t::exec_profile, component_t::ukernel)) - start_ms = get_msec(); - - const uint8_t *src_ptr = reinterpret_cast(src); - uint8_t *dst_ptr = reinterpret_cast(dst); - - const auto &kernel_conf = bmc_; - const dim_t n_blks = utils::div_up(kernel_conf.N, kernel_conf.N_blk); - const dim_t k_blks = utils::div_up(kernel_conf.K, kernel_conf.K_blk); - const auto blk_size = kernel_conf.K_blk * kernel_conf.N_blk; - - const auto i_dt_sz = kernel_conf.b_dt_sz; - const auto o_dt_sz = kernel_conf.a_dt_sz; - - for (dim_t n_blk_idx = 0; n_blk_idx < n_blks; n_blk_idx++) { - const auto n = n_blk_idx * kernel_conf.N_blk; - const bool is_N_tail = (kernel_conf.N - n) < kernel_conf.N_blk; - auto ker_exec_ctx = matmul::jit_brgemm_matmul_copy_b_t::ctx_t(); - ker_exec_ctx.current_N_blk - = is_N_tail ? 
kernel_conf.N_tail : kernel_conf.N_blk; - - int k_blk_idx = 0; - for (; k_blk_idx < kernel_conf.K / kernel_conf.K_blk; k_blk_idx++) { - const auto k = k_blk_idx * kernel_conf.K_blk; - const auto src_offset - = i_dt_sz * (k * strides_[0] + n * strides_[1]); - const auto dst_offset - = o_dt_sz * (k_blk_idx * blk_size + n_blk_idx * k_blks); - ker_exec_ctx.src = &src_ptr[src_offset]; - ker_exec_ctx.tr_src = &dst_ptr[dst_offset]; - ker_exec_ctx.current_K_start = k; - ker_exec_ctx.current_K_iters = kernel_conf.K_blk; - (*pack_B_kernel_)(&ker_exec_ctx); - } - if (kernel_conf.K_tail > 0) { - const auto k = k_blk_idx * kernel_conf.K_blk; - const auto src_offset - = i_dt_sz * (k * strides_[0] + n * strides_[1]); - const auto dst_offset - = o_dt_sz * (k_blk_idx * blk_size + n_blk_idx * k_blks); - ker_exec_ctx.src = &src_ptr[src_offset]; - ker_exec_ctx.tr_src = &dst_ptr[dst_offset]; - ker_exec_ctx.current_K_start = k; - ker_exec_ctx.current_K_iters = kernel_conf.K_tail; - (*pack_B_kernel_)(&ker_exec_ctx); - } - } - - if (get_verbose(verbose_t::exec_profile, component_t::ukernel)) { - double duration_ms = get_msec() - start_ms; - - std::stringstream ss; - ss << "cpu,transform,pack_B,undef," << verbose_info_; - VPROF(start_ms, ukernel, exec, VERBOSE_profile, ss.str().c_str(), - duration_ms); - } - return status::success; -} - -status_t transform_t::create_verbose_info() { -#if defined(DISABLE_VERBOSE) - return status::success; -#else - std::stringstream ss; - - memory_desc_t src_md; - const dims_t dims = {K_, N_}; - CHECK(memory_desc_init_by_strides(src_md, 2, dims, in_dt_, strides_)); - - memory_desc_t dst_md; - const dims_t dst_strides = {out_ld_, 1}; - CHECK(memory_desc_init_by_strides(dst_md, 2, dims, out_dt_, dst_strides)); - - ss << md2fmt_str("src", &src_md, format_kind::undef) << " "; - ss << md2fmt_str("dst", &dst_md, format_kind::undef); - ss << ",,," << md2dim_str(&src_md); - - verbose_info_ = ss.str(); - return status::success; -#endif -} - -//////////////// -// Public API // -//////////////// - -///////////////////////// -// Attribute arguments // -///////////////////////// - -status_t dnnl_ukernel_attr_params_create(attr_params_t **attr_params) { - *attr_params = new attr_params_t(); - return status::success; -} - -status_t dnnl_ukernel_attr_params_set_post_ops_args( - attr_params_t *attr_params, const void **post_ops_args) { - if (attr_params == nullptr) return status::invalid_arguments; - - CHECK(attr_params->set_post_ops_args(post_ops_args)); - return status::success; -} - -status_t dnnl_ukernel_attr_params_set_A_scales( - attr_params_t *attr_params, const void *a_scales) { - if (attr_params == nullptr) return status::invalid_arguments; - - CHECK(attr_params->set_scales(a_scales, DNNL_ARG_SRC)); - return status::success; -} - -status_t dnnl_ukernel_attr_params_set_B_scales( - attr_params_t *attr_params, const void *b_scales) { - if (attr_params == nullptr) return status::invalid_arguments; - - CHECK(attr_params->set_scales(b_scales, DNNL_ARG_WEIGHTS)); - return status::success; -} - -status_t dnnl_ukernel_attr_params_set_D_scales( - attr_params_t *attr_params, const void *d_scales) { - if (attr_params == nullptr) return status::invalid_arguments; - - CHECK(attr_params->set_scales(d_scales, DNNL_ARG_DST)); - return status::success; -} - -status_t dnnl_ukernel_attr_params_destroy(attr_params_t *attr_params) { - delete attr_params; - return status::success; -} - -//////////// -// BRGeMM // -//////////// - -status_t dnnl_brgemm_create(brgemm_t **brgemm, dim_t M, dim_t N, dim_t K, - dim_t 
batch_size, dim_t lda, dim_t ldb, dim_t ldc, data_type_t a_dt, - data_type_t b_dt, data_type_t c_dt) { - if (batch_size <= 0) { - VCHECK_BRGEMM_STATUS( - status::invalid_arguments, false, "batch size is non-positive"); - } - - *brgemm = new brgemm_t( - M, N, K, batch_size, lda, ldb, ldc, a_dt, b_dt, c_dt); - return status::success; -} - -status_t dnnl_brgemm_set_add_C(brgemm_t *brgemm, int add_C) { - if (brgemm == nullptr) return invalid_arguments; - - CHECK(brgemm->set_add_C(add_C)); - return status::success; -} - -status_t dnnl_brgemm_set_post_ops(brgemm_t *brgemm, dim_t ldd, data_type_t d_dt, - const post_ops_t *post_ops) { - if (brgemm == nullptr) return invalid_arguments; - - CHECK(brgemm->set_post_ops(ldd, d_dt, post_ops)); - return status::success; -} - -status_t dnnl_brgemm_set_A_scales(brgemm_t *brgemm, int a_scale_mask) { - if (brgemm == nullptr) return invalid_arguments; - - CHECK(brgemm->set_scales(a_scale_mask, DNNL_ARG_SRC)); - return status::success; -} - -status_t dnnl_brgemm_set_B_scales(brgemm_t *brgemm, int b_scale_mask) { - if (brgemm == nullptr) return invalid_arguments; - - CHECK(brgemm->set_scales(b_scale_mask, DNNL_ARG_WEIGHTS)); - return status::success; -} - -status_t dnnl_brgemm_set_D_scales(brgemm_t *brgemm, int d_scale_mask) { - if (brgemm == nullptr) return invalid_arguments; - - CHECK(brgemm->set_scales(d_scale_mask, DNNL_ARG_DST)); - return status::success; -} - -status_t dnnl_brgemm_finalize(brgemm_t *brgemm) { - if (brgemm == nullptr) return invalid_arguments; - - CHECK(brgemm->finalize()); - return status::success; -} - -status_t dnnl_brgemm_get_B_pack_type( - const brgemm_t *brgemm, dnnl_pack_type_t *pack_type) { - if (brgemm == nullptr) return invalid_arguments; - - if (pack_type) *pack_type = brgemm->get_B_pack_type(); - return status::success; -} - -status_t dnnl_brgemm_get_scratchpad_size(const brgemm_t *brgemm, size_t *size) { - if (brgemm == nullptr) return invalid_arguments; - - if (size) *size = brgemm->get_scratchpad_size(); - return status::success; -} - -status_t dnnl_brgemm_set_hw_context(const brgemm_t *brgemm) { - if (brgemm == nullptr) return invalid_arguments; - - CHECK(brgemm->set_hw_context()); - return status::success; -} - -status_t dnnl_brgemm_release_hw_context() { - if (mayiuse(avx512_core_amx)) { - VCHECK_BRGEMM(amx_tile_release() == status::success, - "amx_tile_release failed"); - } - - return status::success; -} - -status_t dnnl_brgemm_generate(brgemm_t *brgemm) { - if (brgemm == nullptr) return invalid_arguments; - - CHECK(brgemm->generate()); - return status::success; -} - -status_t dnnl_brgemm_execute(const brgemm_t *brgemm, const void *A_ptr, - const void *B_ptr, const dim_t *A_B_offsets, void *C_ptr, - void *scratchpad_ptr) { - CHECK(brgemm->execute(A_ptr, B_ptr, A_B_offsets, C_ptr, scratchpad_ptr)); - return status::success; -} - -status_t dnnl_brgemm_execute_postops(const brgemm_t *brgemm, const void *A_ptr, - const void *B_ptr, const dim_t *A_B_offsets, const void *C_ptr, - void *D_ptr, void *scratchpad_ptr, const attr_params_t *attr_params) { - CHECK(brgemm->execute(A_ptr, B_ptr, A_B_offsets, C_ptr, D_ptr, - scratchpad_ptr, attr_params)); - return status::success; -} - -status_t dnnl_brgemm_destroy(brgemm_t *brgemm) { - delete brgemm; - return status::success; -} - -/////////////// -// Transform // -/////////////// - -status_t dnnl_transform_create(transform_t **transform, dim_t K, dim_t N, - pack_type_t in_pack_type, dim_t in_ld, dim_t out_ld, data_type_t in_dt, - data_type_t out_dt) { - if (transform == nullptr) 
return status::invalid_arguments; - - *transform - = new transform_t(K, N, in_pack_type, in_ld, out_ld, in_dt, out_dt); - return status::success; -} - -status_t dnnl_transform_generate(transform_t *transform) { - if (transform == nullptr) return status::invalid_arguments; - - CHECK(transform->generate()); - return status::success; -} - -status_t dnnl_transform_execute( - const transform_t *transform, const void *in_ptr, void *out_ptr) { - if (utils::any_null(transform, in_ptr, out_ptr)) - return status::invalid_arguments; - - CHECK(transform->execute(in_ptr, out_ptr)); - return status::success; -} - -status_t dnnl_transform_destroy(transform_t *transform) { - delete transform; - return status::success; -} - -#endif - -//vim: et ts=4 sw=4 cindent cino+=l0,\:4,N-s diff --git a/src/cpu/x64/brgemm/capi/brgemm_api.hpp b/src/cpu/x64/brgemm/capi/brgemm_api.hpp deleted file mode 100644 index 0a2604e9520..00000000000 --- a/src/cpu/x64/brgemm/capi/brgemm_api.hpp +++ /dev/null @@ -1,173 +0,0 @@ -/******************************************************************************* -* Copyright 2024 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -#ifndef CPU_X64_BRGEMM_CAPI_BRGEMM_API_HPP -#define CPU_X64_BRGEMM_CAPI_BRGEMM_API_HPP - -#include - -#include "cpu/x64/matmul/brgemm_matmul_copy_utils.hpp" -#include "cpu/x64/matmul/brgemm_matmul_utils.hpp" - -#include "cpu/x64/brgemm/brgemm_types.hpp" - -#ifdef DNNL_EXPERIMENTAL_UKERNEL - -// A section identical to c_map_types.hpp but just for brgemm ukernel so far. 
-namespace dnnl { -namespace impl { -namespace cpu { -namespace x64 { - -using pack_type_t = dnnl_pack_type_t; -namespace pack_type { -const pack_type_t undef = dnnl_pack_type_undef; -const pack_type_t no_trans = dnnl_pack_type_no_trans; -const pack_type_t trans = dnnl_pack_type_trans; -const pack_type_t pack32 = dnnl_pack_type_pack32; -} // namespace pack_type - -using attr_params_t = dnnl_ukernel_attr_params; - -} // namespace x64 -} // namespace cpu -} // namespace impl -} // namespace dnnl - -struct dnnl_ukernel_attr_params : public dnnl::impl::c_compatible { - dnnl_ukernel_attr_params() = default; - - dnnl::impl::status_t set_post_ops_args(const void **post_ops_args); - const void *get_post_ops_args() const { return post_ops_args_; } - - dnnl::impl::status_t set_scales(const void *scales, int arg); - const void *get_scales(int arg) const; - -private: - const void *post_ops_args_; - const void *a_scales_; - const void *b_scales_; - const void *d_scales_; -}; - -struct dnnl_brgemm : public dnnl::impl::c_compatible { - dnnl_brgemm(dnnl::impl::dim_t M, dnnl::impl::dim_t N, dnnl::impl::dim_t K, - dnnl::impl::dim_t batch_size, dnnl::impl::dim_t lda, - dnnl::impl::dim_t ldb, dnnl::impl::dim_t ldc, - dnnl::impl::data_type_t a_dt, dnnl::impl::data_type_t b_dt, - dnnl::impl::data_type_t c_dt) - : M_(M) - , N_(N) - , K_(K) - , batch_size_(batch_size) - , lda_(lda) - , ldb_(ldb) - , ldc_(ldc) - , ldd_(ldc) // User may overwrite with set_post_ops(). - , a_dt_(a_dt) - , b_dt_(b_dt) - , c_dt_(c_dt) - , d_dt_(c_dt) // User may overwrite with set_post_ops(). - , beta_(0.f) // User may overwrite with set_add_C(). - , brgemm_kernel_(nullptr) {} - - ~dnnl_brgemm(); - - dnnl::impl::status_t set_add_C(int add_C); - - dnnl::impl::status_t set_post_ops(dnnl::impl::dim_t ldd, - dnnl::impl::data_type_t d_dt, - const dnnl::impl::post_ops_t *post_ops); - - dnnl::impl::status_t set_scales(int mask, int arg); - - dnnl::impl::status_t finalize(); - - dnnl::impl::cpu::x64::pack_type_t get_B_pack_type() const; - - size_t get_scratchpad_size() const; - - dnnl::impl::status_t set_hw_context() const; - - dnnl::impl::status_t generate(); - - dnnl::impl::status_t execute(const void *A_ptr, const void *B_ptr, - const dnnl::impl::dim_t *A_B_offsets, void *C_ptr, - void *scratchpad_ptr) const; - dnnl::impl::status_t execute(const void *A_ptr, const void *B_ptr, - const dnnl::impl::dim_t *A_B_offsets, const void *C_ptr, - void *D_ptr, void *scratchpad_ptr, - const dnnl::impl::cpu::x64::attr_params_t *attr_params) const; - -private: - // User's inputs. - dnnl::impl::dim_t M_, N_, K_, batch_size_; - dnnl::impl::dim_t lda_, ldb_, ldc_, ldd_; - dnnl::impl::data_type_t a_dt_, b_dt_, c_dt_, d_dt_; - float beta_; - // A copy of attributes to avoid dependency on user's attributes lifetime. - dnnl::impl::primitive_attr_t attr_; - - // A main kernel. - dnnl::impl::cpu::x64::brgemm_desc_t brgemm_desc_; - dnnl::impl::cpu::x64::brgemm_kernel_t *brgemm_kernel_; - - // Creates a `verbose_info_` string once during `generate()` call, and calls - // it during execute(). This is done to avoid string re-creation. - dnnl::impl::status_t create_verbose_info(); - std::string verbose_info_; -}; - -struct dnnl_transform : public dnnl::impl::c_compatible { - // Ctor that follows a call to initialize matmul conf struct. 
- dnnl_transform(dnnl::impl::dim_t K, dnnl::impl::dim_t N, - dnnl::impl::cpu::x64::pack_type_t in_pack_type, - dnnl::impl::dim_t in_ld, dnnl::impl::dim_t out_ld, - dnnl::impl::data_type_t in_dt, dnnl::impl::data_type_t out_dt); - - // Generates a transform kernel. - dnnl::impl::status_t generate(); - - // Executes a transform kernel. - dnnl::impl::status_t execute(const void *src, void *dst) const; - -private: - // User's inputs. - dnnl::impl::dim_t K_, N_; - dnnl::impl::dim_t in_ld_, out_ld_; - dnnl::impl::data_type_t in_dt_, out_dt_; - // Save `strides_` for `execute` to get proper source offset. - dnnl::impl::dims_t strides_; - - // A transform kernel. - // Note: though it's a generic class for any kind of transformation, so far - // it's only matmul's copy_B. - dnnl::impl::cpu::x64::matmul::brgemm_matmul_conf_t bmc_; - // `unique_ptr` is required by API that generates a kernel. - std::unique_ptr - pack_B_kernel_; - - // Creates a `verbose_info_` string once during `generate()` call, and calls - // it during execute(). This is done to avoid string re-creation. - dnnl::impl::status_t create_verbose_info(); - std::string verbose_info_; -}; - -#endif - -#endif - -//vim: et ts=4 sw=4 cindent cino+=l0,\:4,N-s diff --git a/src/cpu/x64/brgemm/jit_brdgmm_kernel.cpp b/src/cpu/x64/brgemm/jit_brdgmm_kernel.cpp index 39c5a990b05..7a8aaf66445 100644 --- a/src/cpu/x64/brgemm/jit_brdgmm_kernel.cpp +++ b/src/cpu/x64/brgemm/jit_brdgmm_kernel.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2021-2024 Intel Corporation +* Copyright 2021-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -23,7 +23,6 @@ #include "cpu/x64/brgemm/jit_brdgmm_kernel.hpp" #include "cpu/x64/cpu_barrier.hpp" #include "cpu/x64/injectors/jit_uni_postops_injector.hpp" -#include "cpu/x64/jit_generator.hpp" #define GET_OFF(field) offsetof(brgemm_kernel_params_t, field) #define GET_OFF_BATCH_ELEMENT(field) offsetof(brgemm_batch_element_t, field) @@ -39,12 +38,13 @@ using namespace Xbyak; template jit_brdgmm_kernel_base_t::jit_brdgmm_kernel_base_t( const brgemm_desc_t &abrd) - : jit_generator(jit_name(), abrd.isa_impl) + : jit_base_brgemm_kernel_t(jit_name(), abrd.isa_impl) , brg(abrd) - , simd_w_(vreg_traits::vlen / brg.typesize_C) + , simd_w_(vreg_traits_t::vlen / brg.typesize_C) , max_vmms_(isa_num_vregs(brg.isa_impl)) , compute_dst_zp_(brg.zp_type_c != brgemm_broadcast_t::none) , compute_src_zp_(brg.zp_type_a != brgemm_broadcast_t::none) + , is_src_zp_bcast_(brg.zp_type_a == brgemm_broadcast_t::per_tensor) , compute_compensation_(compute_src_zp_ || brg.req_s8s8_compensation) , has_vpad_(brg.brgattr.max_top_vpad > 0 || brg.brgattr.max_bottom_vpad > 0) , has_bpad_(brg.brgattr.max_top_bpad > 0 || brg.brgattr.max_bottom_bpad > 0) @@ -147,7 +147,7 @@ void jit_brdgmm_kernel_base_t::read_params() { } if (compute_src_zp_) { - mov(reg_tmp, ptr[param1 + GET_OFF(zp_a_val)]); + mov(reg_tmp, ptr[param1 + GET_OFF(a_zp_values)]); mov(ptr[rsp + src_zp_value_], reg_tmp); mov(reg_tmp, ptr[param1 + GET_OFF(a_zp_compensations)]); @@ -238,8 +238,8 @@ void jit_brdgmm_kernel_base_t::cvt2ps(data_type_t type_in, bool store) { const int tail_size = tail_length(); const bool is_load_tail = op.isMEM() && mask_flag && tail_size > 0 - && (tail_size - < static_cast(vreg_traits::vlen / sizeof(float))); + && (tail_size < static_cast( + vreg_traits_t::vlen / sizeof(float))); if 
(IMPLICATION(is_load_tail, isa_has_masks(brg.isa_impl))) { const Vmm vmm = maybe_mask(vmm_in, is_load_tail, store); switch (type_in) { @@ -473,9 +473,9 @@ void jit_brdgmm_kernel_base_t::store_accumulators_apply_post_ops( const bool dt_requires_saturation = one_of(brg.dt_d, data_type::u8, data_type::s8, data_type::s32); - auto vmm_lbound = vmm_tmp(0); - auto vmm_ubound = vmm_tmp(1); if (dt_requires_saturation) { + auto vmm_lbound = vmm_tmp(0); + auto vmm_ubound = vmm_tmp(1); init_saturate_f32( vmm_lbound, vmm_ubound, reg_tmp, data_type::f32, brg.dt_d); } @@ -484,6 +484,8 @@ void jit_brdgmm_kernel_base_t::store_accumulators_apply_post_ops( for (int m = 0; m < m_blocks; m++) { if (dt_requires_saturation) { + auto vmm_lbound = vmm_tmp(0); + auto vmm_ubound = vmm_tmp(1); for_(int n = 0; n < n_blocks; n++) for (int v_i = 0; v_i < v_substep; ++v_i) { if (get_substep_simd(n, v_i, has_n_tail) <= 0) continue; @@ -511,10 +513,7 @@ void jit_brdgmm_kernel_base_t::store_accumulators_apply_post_ops( if (brg.is_bf16_emu) bf16_emu_->vcvtneps2bf16(vmm_low, vmm); else - vcvtneps2bf16(vmm_low, vmm, - brg.isa_impl == avx2_vnni_2 - ? Xbyak::VexEncoding - : Xbyak::EvexEncoding); + vcvtneps2bf16(vmm_low, vmm, get_encoding()); if (mask_flag) vmovdqu16(addr, r_vmm_low); else @@ -553,9 +552,9 @@ void jit_brdgmm_kernel_base_t::store_accumulators_without_post_ops( const bool dt_requires_saturation = brg.is_int8 && brg.dt_c != data_type::s32; - auto vmm_lbound = vmm_tmp(0); - auto vmm_ubound = vmm_tmp(1); if (dt_requires_saturation) { + auto vmm_lbound = vmm_tmp(0); + auto vmm_ubound = vmm_tmp(1); init_saturate_f32( vmm_lbound, vmm_ubound, reg_tmp, data_type::f32, brg.dt_d); } @@ -567,8 +566,11 @@ void jit_brdgmm_kernel_base_t::store_accumulators_without_post_ops( if (substep_simd <= 0) continue; const bool mask_flag = substep_simd < simd_w_; auto vmm_acc = accm(m_blocks, n_blocks, m, n, v_i); - if (dt_requires_saturation) + if (dt_requires_saturation) { + auto vmm_lbound = vmm_tmp(0); + auto vmm_ubound = vmm_tmp(1); saturate_cvt_f32(vmm_acc, vmm_lbound, vmm_ubound, brg.dt_d); + } const auto offset = C_offset(m, n, v_i); if (IMPLICATION(mask_flag, isa_has_masks(brg.isa_impl))) { auto vmm_acc_masked = maybe_mask(vmm_acc, mask_flag, true); @@ -604,6 +606,17 @@ void jit_brdgmm_kernel_base_t::maybe_transpose_interleaved_vnni_to_plain( } } +template +void jit_brdgmm_kernel_base_t::load_src_zp() { + mov(reg_src_zero_point, ptr[rsp + src_zp_value_]); + lea(reg_src_zero_point, + is_src_zp_bcast_ + ? 
ptr_b[reg_src_zero_point] + : ptr[reg_src_zero_point + reg_aux_N * sizeof(int32_t)]); + if (!is_superset(brg.isa_impl, avx512_core) && is_src_zp_bcast_) + uni_vpbroadcastd(vmm_bcast(), ptr[reg_src_zero_point]); +} + template void jit_brdgmm_kernel_base_t::compute_int8_compensation( int m_blocks, int n_blocks, bool has_n_tail) { @@ -615,12 +628,10 @@ void jit_brdgmm_kernel_base_t::compute_int8_compensation( lea(reg_s8s8_comp, ptr[reg_s8s8_comp + reg_aux_N * sizeof(int32_t)]); } if (compute_src_zp_) { - lea(reg_src_zero_point, ptr[rsp + src_zp_value_]); + load_src_zp(); mov(reg_zp_compensation, ptr[rsp + zp_compensation_]); lea(reg_zp_compensation, ptr[reg_zp_compensation + reg_aux_N * sizeof(int32_t)]); - if (!is_superset(brg.isa_impl, avx512_core)) - uni_vpbroadcastd(vmm_bcast(), ptr[reg_src_zero_point]); } for_(int v_i = 0; v_i < v_substep; ++v_i) @@ -635,16 +646,35 @@ void jit_brdgmm_kernel_base_t::compute_int8_compensation( } if (compute_src_zp_) { // zero_point: conv(src_x8, wei_s8) - src_shift_s32 * compensation_s32 - const Vmm vmm_zp = vmm_zp_comp(); - vmovups(vmm_zp, - maybe_EVEX_compress_addr(reg_zp_compensation, offset)); - if (is_superset(brg.isa_impl, avx512_core)) { - const bool src_zp_is_common = true; - vpmulld(vmm_zp, vmm_zp, - maybe_EVEX_compress_addr( - reg_src_zero_point, 0, src_zp_is_common)); + const bool is_tail + = n + 1 == n_blocks && has_n_tail && substep_simd < simd_w_; + const Vmm vmm_zp = isa_has_masks(brg.isa_impl) + ? maybe_mask(vmm_zp_comp(), is_tail, false) + : vmm_zp_comp(); + if (IMPLICATION(is_tail, isa_has_masks(brg.isa_impl))) { + vmovups(vmm_zp, + maybe_EVEX_compress_addr(reg_zp_compensation, offset)); + if (is_src_zp_bcast_) { + if (is_superset(brg.isa_impl, avx512_core)) + vpmulld(vmm_zp, vmm_zp, + maybe_EVEX_compress_addr( + reg_src_zero_point, 0, true)); + else + vpmulld(vmm_zp, vmm_zp, vmm_bcast()); + } else + vpmulld(vmm_zp, vmm_zp, + maybe_EVEX_compress_addr( + reg_src_zero_point, offset)); } else { - vpmulld(vmm_zp, vmm_zp, vmm_bcast()); + const int tail_size = tail_length(); + const Vmm ymm_tmp + = vmm_bcast(); // used for bcast or tail processing in avx2 + load_data(data_type::s32, vmm_zp, + ptr[reg_zp_compensation + offset], tail_size); + if (!is_src_zp_bcast_) + load_data(data_type::s32, ymm_tmp, + ptr[reg_src_zero_point + offset], tail_size); + vpmulld(vmm_zp, vmm_zp, ymm_tmp); } } for (int m = 0; m < m_blocks; m++) { @@ -696,9 +726,9 @@ void jit_brdgmm_kernel_base_t::load_a( + is_tail_block * v_i * simd_w_ * brg.typesize_A]; if (IMPLICATION(mask_flag, isa_has_masks(brg.isa_impl))) { vmma = maybe_mask(vmma, mask_flag, false); - if (brg.is_f32) { + if (brg.dt_a == data_type::f32) { vmovups(vmma, addr); - } else if (brg.is_bf16) { + } else if (brg.dt_a == data_type::bf16) { if (brg.isa_impl == avx2_vnni_2) { if (is_tail_block) { vpmovzxwd(vmma, addr); @@ -711,7 +741,7 @@ void jit_brdgmm_kernel_base_t::load_a( vpmovzxwd(vmma, addr); if (is_slow_bf16_vnni()) vpslld(vmma, vmma, 16); } - } else if (brg.is_f16) { + } else if (brg.dt_b == data_type::f16) { if (brg.isa_impl == avx2_vnni_2) { if (is_tail_block) vcvtph2ps(vmma, addr); @@ -721,7 +751,7 @@ void jit_brdgmm_kernel_base_t::load_a( vcvtneoph2ps(vmma, addr); } else vcvtph2ps(vmma, addr); - } else if (brg.is_int8) { + } else if (utils::one_of(brg.dt_a, data_type::s8, data_type::u8)) { if (is_fast_vnni_int8()) { assert(!mask_flag); vbroadcasti32x4(vmma, addr); @@ -747,9 +777,9 @@ void jit_brdgmm_kernel_base_t::load_b( const bool is_tail_block = has_n_tail && (n_i + 1 == n_blocks); const auto 
addr = ptr[reg_aux_B + B_offset(n_i) + is_tail_block * v_i * simd_w_ * brg.typesize_B]; - if (brg.is_f32) { + if (brg.dt_b == data_type::f32) { vmovups(vmmb, addr); - } else if (brg.is_int8) { + } else if (brg.dt_b == data_type::s8) { if (wei_zp) { // load weights for zero-point computation vpmovsxbd(vmmb, addr); if (is_fast_vnni_int8()) vpermd(vmmb, vmm_permute(), vmmb); @@ -762,7 +792,7 @@ void jit_brdgmm_kernel_base_t::load_b( vpmovsxbd(vmmb, addr); } } - } else if (brg.is_f16) { + } else if (brg.dt_b == data_type::f16) { if (brg.isa_impl == avx2_vnni_2) { if (is_tail_block) vcvtph2ps(vmmb, addr); @@ -772,7 +802,7 @@ void jit_brdgmm_kernel_base_t::load_b( vcvtneoph2ps(vmmb, addr); } else vcvtph2ps(vmmb, addr); - } else if (brg.is_bf16) { + } else if (brg.dt_b == data_type::bf16) { if (brg.isa_impl == avx2_vnni_2) { if (is_tail_block) { vpmovzxwd(vmmb, addr); @@ -783,31 +813,52 @@ void jit_brdgmm_kernel_base_t::load_b( vcvtneobf162ps(vmmb, addr); } else { vpmovzxwd(vmmb, addr); - if (is_slow_bf16_vnni()) vpslld(vmmb, vmmb, 16); + if (is_slow_bf16_vnni() || brg.is_f32) vpslld(vmmb, vmmb, 16); } } } template void jit_brdgmm_kernel_base_t::comp_dot_product( - compute_pad_kernel_t kernel_type, Vmm vmm_acc, Vmm vmmb) { + compute_pad_kernel_t kernel_type, Vmm vmm_acc, Vmm vmmb, int n, + bool is_tail_block) { switch (kernel_type) { case compute_pad_kernel_t::s8s8_kernel: - vpdpbusd(vmm_acc, vmm_shift(), vmmb, - is_superset(brg.isa_impl, avx512_core) - ? Xbyak::EvexEncoding - : Xbyak::VexEncoding); + vpdpbusd(vmm_acc, vmm_shift(), vmmb, get_encoding()); break; - case compute_pad_kernel_t::zero_point_kernel: - if (is_superset(brg.isa_impl, avx512_core)) { - vpmulld(vmm_zp_comp(), vmmb, - maybe_EVEX_compress_addr(reg_src_zero_point, 0, true)); + case compute_pad_kernel_t::zero_point_kernel: { + const Vmm vmm_zp = isa_has_masks(brg.isa_impl) + ? 
maybe_mask(vmm_zp_comp(), is_tail_block, false) + : vmm_zp_comp(); + const size_t offset = comp_offset(n); + if (IMPLICATION(is_tail_block, isa_has_masks(brg.isa_impl))) { + if (is_src_zp_bcast_) { + if (is_superset(brg.isa_impl, avx512_core)) + vpmulld(vmm_zp, vmmb, + maybe_EVEX_compress_addr( + reg_src_zero_point, 0, true)); + else + vpmulld(vmm_zp, vmmb, vmm_bcast()); + } else { + const Xbyak::Address src_zp_addr = maybe_EVEX_compress_addr( + reg_src_zero_point, offset); + if (is_fast_vnni_int8()) { + vmovups(vmm_zp, src_zp_addr); + vpermd(vmm_zp, vmm_permute(), vmm_zp); + vpmulld(vmm_zp, vmmb, vmm_zp); + } else + vpmulld(vmm_zp, vmmb, src_zp_addr); + } } else { - uni_vpbroadcastd(vmm_bcast(), ptr[reg_src_zero_point]); - vpmulld(vmm_zp_comp(), vmmb, vmm_bcast()); + const Vmm ymm_tmp + = vmm_bcast(); // used for bcast or tail processing in avx2 + if (!is_src_zp_bcast_) + load_data(data_type::s32, ymm_tmp, + ptr[reg_src_zero_point + offset], tail_length()); + vpmulld(vmm_zp, vmmb, ymm_tmp); } vpaddd(vmm_acc, vmm_acc, vmm_zp_comp()); - break; + } break; default: assert(!"unsupported comp_kernel type"); } } @@ -848,21 +899,25 @@ void jit_brdgmm_kernel_base_t::pad_comp_kernel( for (int pad_i = max_m_unroll; pad_i > 0; --pad_i) { L(jmp_table_labels[pad_i]); - if (is_zero_point_kernel) - lea(reg_src_zero_point, ptr[rsp + src_zp_value_]); + if (is_zero_point_kernel) load_src_zp(); if (pad_i > m_blocks) continue; const int m_i = get_mi(pad_i); int p_b_i = 0; for (int n_i = 0; n_i < n_blocks; ++n_i, ++p_b_i) { - if (get_substep_simd(n_i, 0, has_tail) <= 0) continue; + const int substep_simd = get_substep_simd(n_i, 0, has_tail); + if (substep_simd <= 0) continue; const Vmm vmm_acc = accm(m_blocks, n_blocks, m_i, n_i, 0); + const bool is_tail_block + = n_i + 1 == n_blocks && has_tail && substep_simd < simd_w_; if (p_b_i < n_preload_b_vmms) { - comp_dot_product(kernel_type, vmm_acc, vmm_b(p_b_i)); + comp_dot_product( + kernel_type, vmm_acc, vmm_b(p_b_i), n_i, is_tail_block); } else { // preloaded vmm_b not available const Vmm vmm_wei = vmm_b(max_bvmms - 1); load_b(vmm_wei, n_i, 0, has_tail, load_broadcast_wei); - comp_dot_product(kernel_type, vmm_acc, vmm_wei); + comp_dot_product( + kernel_type, vmm_acc, vmm_wei, n_i, is_tail_block); } } } @@ -880,8 +935,7 @@ void jit_brdgmm_kernel_base_t::batch_pad_kernel( auto kernel_body = [&](compute_pad_kernel_t kernel_type) { const bool is_zero_point_kernel = kernel_type == compute_pad_kernel_t::zero_point_kernel; - if (is_zero_point_kernel) - lea(reg_src_zero_point, ptr[rsp + src_zp_value_]); + if (is_zero_point_kernel) load_src_zp(); for (int nb_i = 0; nb_i < n_blocks; nb_i += max_bvmms) { const int n_e = nstl::min(nb_i + max_bvmms, n_blocks) - nb_i; for (int i = 0; i < n_e; ++i) { @@ -893,9 +947,13 @@ void jit_brdgmm_kernel_base_t::batch_pad_kernel( for_(int m_i = 0; m_i < m_blocks; ++m_i) for (int i = 0; i < n_e; ++i) { const int n_i = nb_i + i; - if (get_substep_simd(n_i, 0, has_tail) <= 0) continue; + const int substep_simd = get_substep_simd(n_i, 0, has_tail); + if (substep_simd <= 0) continue; const Vmm vmm_acc = accm(m_blocks, n_blocks, m_i, n_i, 0); - comp_dot_product(kernel_type, vmm_acc, vmm_b(i)); + const bool is_tail_block + = n_i + 1 == n_e && has_tail && substep_simd < simd_w_; + comp_dot_product( + kernel_type, vmm_acc, vmm_b(i), n_i, is_tail_block); } } }; @@ -938,10 +996,7 @@ void jit_brdgmm_kernel_base_t::brdgmm_microkernel(int m_blocks, if (brg.dt_a == data_type::s8 && isa_has_s8s8(brg.isa_impl)) vpdpbssd(vmm_acc, vmma, vmmb); else - 
vpdpbusd(vmm_acc, vmma, vmmb, - is_superset(brg.isa_impl, avx512_core) - ? Xbyak::EvexEncoding - : Xbyak::VexEncoding); + vpdpbusd(vmm_acc, vmma, vmmb, get_encoding()); } }; @@ -1007,8 +1062,8 @@ void jit_brdgmm_kernel_base_t::brdgmm_microkernel(int m_blocks, align(64); L(jmp_table_base); - for (int m_i = 0; m_i < m_blocks; ++m_i) { - putL(jmp_table_labels[m_i]); + for (const auto &label : jmp_table_labels) { + putL(label); } } @@ -1384,7 +1439,7 @@ void brdgmm_kernel_t::operator()(brgemm_kernel_params_t *params) const { } template -const jit_generator *brdgmm_kernel_t::get_jit_generator() const { +const jit_generator_t *brdgmm_kernel_t::get_jit_generator() const { return brgemm_kernel_; } diff --git a/src/cpu/x64/brgemm/jit_brdgmm_kernel.hpp b/src/cpu/x64/brgemm/jit_brdgmm_kernel.hpp index e3d6138dd5e..236d027de56 100644 --- a/src/cpu/x64/brgemm/jit_brdgmm_kernel.hpp +++ b/src/cpu/x64/brgemm/jit_brdgmm_kernel.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2021-2024 Intel Corporation +* Copyright 2021-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -34,7 +34,7 @@ namespace cpu { namespace x64 { template -struct jit_brdgmm_kernel_base_t : public jit_generator { +struct jit_brdgmm_kernel_base_t : public jit_base_brgemm_kernel_t { jit_brdgmm_kernel_base_t(const brgemm_desc_t &abrd); DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_brdgmm_kernel_base_t) @@ -160,13 +160,15 @@ struct jit_brdgmm_kernel_base_t : public jit_generator { return vmm_alloc.get_compute_vmm_count(); } + const brgemm_desc_t &get_brg() const override { return brg; } + private: // note: this kernel doesn't yet support TMM's. We differentiate Wmm and Vmm // just to follow same template style as brgemm_kernel. using Vmm = typename utils::conditional::value, Xbyak::Zmm, Wmm>::type; - using Vmm_low_t = typename vreg_traits::Vmm_lower_t; + using Vmm_low_t = typename vreg_traits_t::Vmm_lower_t; using po_injector_t = injector::jit_uni_postops_injector_base_t; std::unique_ptr postops_injector_; std::unique_ptr bf16_emu_; @@ -230,6 +232,7 @@ struct jit_brdgmm_kernel_base_t : public jit_generator { const int simd_w_; const int max_vmms_; const bool compute_dst_zp_, compute_src_zp_; + const bool is_src_zp_bcast_; const bool compute_compensation_; // code-path for either s8s8 or src_zp const bool has_vpad_; // vertical padding w.r.t. 
M dimension const bool has_bpad_; // batch pad is computed for the overlap between the @@ -341,7 +344,8 @@ struct jit_brdgmm_kernel_base_t : public jit_generator { void load_b( Vmm vmmb, int n_i, int v_i, bool has_n_tail, bool wei_zp = false); void comp_dot_product(compute_pad_kernel_t kernel_type, Vmm vmm_acc, - Vmm vmmb); // int8 compensation dot_product (zp and s8s8) + Vmm vmmb, int n, + bool is_tail_block); // int8 compensation dot_product (zp and s8s8) void pad_comp_kernel(compute_pad_kernel_t kernel_type, int m_blocks, int n_blocks, int padding, const Xbyak::Reg64 reg_pad, const std::function &get_mi, bool has_tail = false); @@ -360,6 +364,7 @@ struct jit_brdgmm_kernel_base_t : public jit_generator { void apply_post_ops(int m_blocks, int n_blocks, bool has_n_tail); void maybe_transpose_interleaved_vnni_to_plain( int m_blocks, int n_blocks, bool has_n_tail); + void load_src_zp(); void compute_int8_compensation(int m_blocks, int n_blocks, bool has_n_tail); void store_accumulators(int m_blocks, int n_blocks, bool has_n_tail); void store_accumulators_without_post_ops( diff --git a/src/cpu/x64/brgemm/jit_brgemm_amx_uker.cpp b/src/cpu/x64/brgemm/jit_brgemm_amx_uker.cpp index 388e8c01742..fef520a0929 100644 --- a/src/cpu/x64/brgemm/jit_brgemm_amx_uker.cpp +++ b/src/cpu/x64/brgemm/jit_brgemm_amx_uker.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2021-2024 Intel Corporation +* Copyright 2021-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -26,7 +26,6 @@ #include "cpu/x64/cpu_isa_traits.hpp" #include "cpu/x64/injectors/jit_uni_postops_injector.hpp" #include "cpu/x64/jit_avx512_core_fp8cvt.hpp" -#include "cpu/x64/jit_generator.hpp" #define GET_OFF(field) offsetof(brgemm_kernel_params_t, field) #define GET_OFF_BATCH_ELEMENT(field) offsetof(brgemm_batch_element_t, field) @@ -39,9 +38,9 @@ namespace x64 { using namespace dnnl::impl::utils; using namespace Xbyak; -struct jit_brgemm_amx_uker_base_t : public jit_generator { +struct jit_brgemm_amx_uker_base_t : public jit_base_brgemm_kernel_t { jit_brgemm_amx_uker_base_t(const brgemm_desc_t &abrg) - : jit_generator(jit_name(), abrg.isa_impl) + : jit_base_brgemm_kernel_t(jit_name(), abrg.isa_impl) , brg(abrg) , postops_injector_(nullptr) { @@ -135,6 +134,8 @@ struct jit_brgemm_amx_uker_base_t : public jit_generator { brgemm_desc_t brg; + const brgemm_desc_t &get_brg() const override { return brg; } + private: using po_injector_t = injector::jit_uni_postops_injector_base_t; std::unique_ptr postops_injector_; @@ -145,8 +146,7 @@ struct jit_brgemm_amx_uker_base_t : public jit_generator { using reg64_t = const Xbyak::Reg64; enum { simd_w = 16, - zmm_width_in_bytes = cpu_isa_traits::vlen, - tile_size = 1024 + zmm_width_in_bytes = cpu_isa_traits_t::vlen, }; // Register decomposition @@ -259,10 +259,13 @@ struct jit_brgemm_amx_uker_base_t : public jit_generator { struct dim_iteration_t { size_t idx = 0; - std ::vector blocks; + std::vector blocks; virtual bool operator==(const dim_iteration_t &rhs) const { return blocks == rhs.blocks; } + virtual bool operator!=(const dim_iteration_t &rhs) const { + return !operator==(rhs); + } size_t pos(size_t b) const { assert(b < blocks.size()); @@ -279,12 +282,12 @@ struct jit_brgemm_amx_uker_base_t : public jit_generator { return blocks[b].block; } - int is_tail(size_t b) const { + bool is_tail(size_t b) const { assert(b < blocks.size()); 
return blocks[b].is_tail; } - int block2() const { return blocks.size(); } + int block2() const { return static_cast(blocks.size()); } int length() const { if (blocks.empty()) return 0; @@ -307,13 +310,20 @@ struct jit_brgemm_amx_uker_base_t : public jit_generator { bd_iteration_t *similar {nullptr}; Label lstart; - virtual bool operator==(const bd_iteration_t &rhs) const { + bool operator==(const dim_iteration_t &_rhs) const override { + // `downcast` will catch a type mismatch in debug mode. + // Note: it supports only a pointer type so far. + const bd_iteration_t &rhs + = *utils::downcast(&_rhs); bool res = dim_iteration_t::operator==(rhs) && A_shift == rhs.A_shift && C_shift == rhs.C_shift && D_shift == rhs.D_shift && bd_mask == rhs.bd_mask && zp_comp_pad_a_shift == rhs.zp_comp_pad_a_shift; return res; } + bool operator!=(const dim_iteration_t &_rhs) const override { + return !operator==(_rhs); + } }; struct bs_iteration_t { @@ -398,6 +408,7 @@ struct jit_brgemm_amx_uker_base_t : public jit_generator { Xbyak::Opmask ld_full_mask = Xbyak::Opmask(2); Xbyak::Opmask ld_tail_mask = Xbyak::Opmask(3); Xbyak::Opmask fp_col_mask = Xbyak::Opmask(4); + Xbyak::Opmask rd_tail_mask = Xbyak::Opmask(5); // Zmm map below const Xbyak::Zmm &zmm_tmp_1() const noexcept { return this->zmm0; } @@ -518,6 +529,10 @@ struct jit_brgemm_amx_uker_base_t : public jit_generator { reg64_t reg_base, size_t offset, reg64_t reg_stride, matrix_kind_t mk); + bool maybe_pre_process_k_tail(brgemm_iteration_t &bi, int bdb, + const Tmm &t1, reg64_t reg_base, size_t offset, reg64_t reg_stride, + matrix_kind_t mk); + void maybe_tileloadd_nt( brgemm_iteration_t &bi, matrix_kind_t mk, int xdb, size_t offset); @@ -709,11 +724,12 @@ size_t jit_brgemm_amx_uker_base_t::B_offset( const auto rdb_B_offset = bi.rdi->pos(0) * brg.rd_block * LDB_size_; - const auto ldb_B_offset = bi.ldi->pos(0) * ld_block_B_size_ * brg.ld_step; + const auto ldb_offs = bi.ldi->pos(ldb) * brg.ld_block; + const auto ldb_B_offset = brg.typesize_B + * ((ldb_offs / brg.LDB) * brg.brgattr.LDB2 + + (ldb_offs % brg.LDB) * brg.rd_step); - return rdb_B_offset + ldb_B_offset - + (brg.is_blocked ? 1 : brg.rd_step) * ldb * ld_block_B_size_ - + bs_offs; + return rdb_B_offset + ldb_B_offset + bs_offs; } size_t jit_brgemm_amx_uker_base_t::C_offset(const brgemm_iteration_t &bi, @@ -721,7 +737,12 @@ size_t jit_brgemm_amx_uker_base_t::C_offset(const brgemm_iteration_t &bi, const auto bi_bd_start = get_out_bd(bi.bdi, 0, 0); const auto bd = get_out_bd(bi.bdi, bdb, inp_bd); const auto bd_shift = bd - (ununroll_bd_loop ? bi_bd_start : 0); - return (size_t)bd_shift * LDC2_size_M_ + (size_t)ldb * LDC2_size_N_; + size_t ldc_elem = (size_t)ldb * brg.ld_block; + size_t bloc_idx = ldc_elem / brg.LDC; + size_t in_block = ldc_elem % brg.LDC; + + return (size_t)bd_shift * LDC2_size_M_ + (size_t)bloc_idx * LDC2_size_N_ + + in_block * brg.typesize_C; } size_t jit_brgemm_amx_uker_base_t::D_offset(const brgemm_iteration_t &bi, @@ -1103,9 +1124,14 @@ void jit_brgemm_amx_uker_base_t::prefetch_CD_range(brgemm_iteration_t &bi, auto ptr_D = EVEX_compress_addr(reg_D, d_offset); uni_prefetch(ptr_D, pft, true); } else if (are_post_ops_applicable_) { - const auto c_offset = C_offset(bi, bdb, bd, ldb_pos); - auto ptr_C = EVEX_compress_addr(reg_C, c_offset); - uni_prefetch(ptr_C, pft, true); + // TODO: split hints C and D hints + // Using prefetchw for the C matrix is generally harmful + // because the C matrix is frequently reused and remains in the cache. 
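// (Editor's aside, not part of the patch.) For context on the trade-off this
// comment describes: prefetchw requests the cache line in exclusive
// (write-intent) state, which pays off for a write-once destination like D
// but can evict C lines that later iterations still re-read. A hedged Xbyak
// sketch of the two hint flavors, assuming a generator and these operands
// are in scope:
//     prefetcht1(ptr[reg_D + d_offset]); // read hint, stage into L2
//     prefetchw(ptr[reg_D + d_offset]);  // write-intent hint (RFO)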
+ // However, it is very necessary for the D matrix + + // const auto c_offset = C_offset(bi, bdb, bd, ldb_pos); + // auto ptr_C = EVEX_compress_addr(reg_C, c_offset); + // uni_prefetch(ptr_C, pft, true); } else { const auto d_offset = D_offset(bi, bdb, bd, ldb_pos); auto ptr_D = EVEX_compress_addr(reg_D, d_offset); @@ -1666,11 +1692,17 @@ void jit_brgemm_amx_uker_base_t::maybe_tileloadd_nt( auto reg_base = is_A ? reg_A : reg_B; auto reg_stride = is_A ? reg_stride_lda : reg_stride_ldb; - if (brg.is_input_convert()) + if (brg.is_input_convert()) { // try_load_nt is not supported in maybe_pre_process_data as there is // no guarantee that the data is cache line aligned. maybe_pre_process_data(bi, t1, reg_base, offset, reg_stride, mk); - else if (load_nt) + return; + } + + if (maybe_pre_process_k_tail(bi, xdb, t1, reg_base, offset, reg_stride, mk)) + return; + + if (load_nt) tileloaddt1(t1, ptr[reg_base + offset + reg_stride]); else tileloadd(t1, ptr[reg_base + offset + reg_stride]); @@ -1771,10 +1803,9 @@ void jit_brgemm_amx_uker_base_t::fp8_to_f16_upconvert(brgemm_iteration_t &bi, assert(max_num_cols > 0); if (col_tail) { - const int tail_mask = (1 << col_tail) - 1; - auto reg_tmp_32 = reg_tmp_gpr.cvt32(); - mov(reg_tmp_32, tail_mask); - kmovd(fp_col_mask, reg_tmp_32); + const auto tail_mask = (static_cast(1) << col_tail) - 1; + mov(reg_tmp_gpr, tail_mask); + kmovq(fp_col_mask, reg_tmp_gpr); } // Note: using the same register used in col_tail, so order is important @@ -1810,10 +1841,9 @@ void jit_brgemm_amx_uker_base_t::bf32_downconvert(brgemm_iteration_t &bi, assert(max_num_cols > 0); if (col_tail) { - const int tail_mask = (1 << col_tail) - 1; - auto reg_tmp_32 = reg_tmp_gpr.cvt32(); - mov(reg_tmp_32, tail_mask); - kmovw(fp_col_mask, reg_tmp_32); + const auto tail_mask = (static_cast(1) << col_tail) - 1; + mov(reg_tmp_gpr, tail_mask); + kmovq(fp_col_mask, reg_tmp_gpr); } // Note: using the same register used in col_tail, so order is important @@ -1898,10 +1928,9 @@ void jit_brgemm_amx_uker_base_t::bf32_downconvert_to_vnni( }; if (col_tail) { - const int tail_mask = (1 << col_tail) - 1; - auto reg_tmp_32 = reg_tmp_gpr.cvt32(); - mov(reg_tmp_32, tail_mask); - kmovw(fp_col_mask, reg_tmp_32); + const auto tail_mask = (static_cast(1) << col_tail) - 1; + mov(reg_tmp_gpr, tail_mask); + kmovq(fp_col_mask, reg_tmp_gpr); } // Note: using the same register used in col_tail, so order is important @@ -1962,12 +1991,12 @@ void jit_brgemm_amx_uker_base_t::maybe_pre_process_data(brgemm_iteration_t &bi, auto &transform_buf = is_A ? transform_buf_map_A_ : transform_buf_map_B_; const auto transform_offset - = use_ils_ ? brg.get_num_C_tiles() * tile_size : 0; + = use_ils_ ? 
brg.get_num_C_tiles() * brgemm_desc_t::tilesize : 0; const auto max_bdb2 = tloop.bdis[0].block2(); const auto max_rdb = tloop.rdis.size(); const auto matrix_a_offset = transform_offset; const auto matrix_b_offset = transform_offset - + tile_size + + brgemm_desc_t::tilesize * (nstl::max(should_save_transform(mk), should_save_transform(matrix_A) * brg.brgattr.max_bs * max_bdb2 * max_rdb)); @@ -1977,7 +2006,7 @@ void jit_brgemm_amx_uker_base_t::maybe_pre_process_data(brgemm_iteration_t &bi, if (transform_buf.find(key) != transform_buf.end()) { auto buf_idx = transform_buf[key]; - auto offt = matrix_offset + buf_idx * tile_size; + auto offt = matrix_offset + buf_idx * brgemm_desc_t::tilesize; tileloadd(t1, ptr[reg_buf + reg_converted_stride + offt]); return; } @@ -1986,7 +2015,7 @@ void jit_brgemm_amx_uker_base_t::maybe_pre_process_data(brgemm_iteration_t &bi, // save offset of the transformation if required. if (should_save_transform(mk)) { auto buf_idx = transform_buf.size(); - buf_offt = matrix_offset + buf_idx * tile_size; + buf_offt = matrix_offset + buf_idx * brgemm_desc_t::tilesize; transform_buf[key] = buf_idx; } @@ -2020,6 +2049,72 @@ void jit_brgemm_amx_uker_base_t::maybe_pre_process_data(brgemm_iteration_t &bi, if (buf_offt) sub(reg_buf, buf_offt); } +bool jit_brgemm_amx_uker_base_t::maybe_pre_process_k_tail( + brgemm_iteration_t &bi, int bdb, const Tmm &t1, reg64_t reg_base, + size_t offset, reg64_t reg_stride, matrix_kind_t mk) { + const auto &tloop = imap_[bi.apply_postops]; + + const auto need_k_tail_processing = mk == matrix_A && brg.amx_wary_k_tail() + && brg.rdb_tail != 0 && bi.bdi->idx == tloop.bdis.size() - 1 + && bdb == bi.bdi->block2() - 1 && bi.last_bsi + && tloop.is_last_rdi(bi.rdi); + + if (!need_k_tail_processing) return false; + + auto transform_offset = brg.get_num_C_tiles() * brgemm_desc_t::tilesize + + brg.get_convert_wsp_buffer_size(); + + if (transform_offset) add(reg_buf, transform_offset); + mov(reg_converted_stride, zmm_width_in_bytes); + + // reuse transformed data from matrix A for ldi > 0 + if (bi.ldi->idx == 0) { + const auto num_rows = palette_.rows[t1.getIdx()]; + const auto num_col_bytes = palette_.cols[t1.getIdx()]; + + const auto max_num_cols + = nstl::min(num_col_bytes / brg.typesize_A, brg.rdb_tail); + const size_t col_tail + = max_num_cols % (zmm_width_in_bytes / brg.typesize_A); + if (col_tail) { + const auto tail_mask = (static_cast(1) << col_tail) - 1; + mov(reg_tmp_gpr, tail_mask); + kmovq(rd_tail_mask, reg_tmp_gpr); + } + auto zmm_1 = zmm_tmp_1(); + auto zmm_1_masked = col_tail ? zmm_1 | rd_tail_mask | T_z : zmm_1; + + assert(max_num_cols > 0); + + const auto reg_data_aux = reg_tmp_gpr; + lea(reg_data_aux, ptr[reg_base + offset]); + + for (int r = 0; r < num_rows; ++r) { + switch (brg.dt_a) { + case data_type::bf16: + case data_type::f16: + vmovdqu16(zmm_1_masked, ptr[reg_data_aux]); + break; + case data_type::f8_e5m2: + case data_type::f8_e4m3: + case data_type::s8: + case data_type::u8: + vmovdqu8(zmm_1_masked, ptr[reg_data_aux]); + break; + default: assert(!"unsupported data type"); + } + vmovups(ptr[reg_buf + r * zmm_width_in_bytes], zmm_1); + add(reg_data_aux, reg_stride); + } + } + // load into tmm from the transformed data. 
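// (Editor's sketch, not from the patch.) Scalar model of what the masked row
// loop above stages: each rd-tail row of A is copied into the scratch buffer
// and the columns beyond the tail are zeroed, which is what the
// `| rd_tail_mask | T_z` masked loads achieve. The mask itself is the usual
// low-bits idiom: (1 << n) - 1 sets exactly the n tail lanes (hence the
// switch to a 64-bit value and kmovq, which keeps the idiom valid for the
// byte-granular masks of 64-byte ZMM loads). Helper names are illustrative.

#include <cstddef>
#include <cstdint>
#include <cstring>

inline uint64_t low_bits_mask(unsigned n) { // valid for 0 < n < 64
    return (uint64_t(1) << n) - 1;
}

void stage_k_tail_rows(const char *A, size_t lda_bytes, size_t num_rows,
        size_t row_bytes, size_t tail_bytes, char *scratch) {
    for (size_t r = 0; r < num_rows; ++r) {
        std::memcpy(scratch + r * row_bytes, A + r * lda_bytes, tail_bytes);
        std::memset(scratch + r * row_bytes + tail_bytes, 0,
                row_bytes - tail_bytes); // zero the masked-out K columns
    }
}

// (The kernel then tile-loads from this staged buffer, as the next line does.)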
+ tileloadd(t1, ptr[reg_buf + reg_converted_stride]); + + // reset buf pointer + if (transform_offset) sub(reg_buf, transform_offset); + return true; +} + void jit_brgemm_amx_uker_base_t::gemm_microkernel_amx(brgemm_iteration_t &bi) { prf0A.reset(); prf1A.reset(); @@ -2064,8 +2159,8 @@ void jit_brgemm_amx_uker_base_t::gemm_microkernel_amx(brgemm_iteration_t &bi) { void jit_brgemm_amx_uker_base_t::rdb_loop(brgemm_iteration_t &bi) { const auto &tloop = imap_[bi.apply_postops]; - for (size_t irdi = 0; irdi < tloop.rdis.size(); irdi++) { - bi.rdi = &(tloop.rdis[irdi]); + for (auto &rdi : tloop.rdis) { + bi.rdi = &rdi; gemm_microkernel_amx(bi); } } @@ -2195,8 +2290,8 @@ void jit_brgemm_amx_uker_base_t::ldb_loop(brgemm_iteration_t &bi) { // we move to next bdb2 block. const auto &tloop = imap_[bi.apply_postops]; transform_buf_map_A_.clear(); - for (size_t ildi = 0; ildi < tloop.ldis.size(); ildi++) { - bi.ldi = &(tloop.ldis[ildi]); + for (auto &ldi : tloop.ldis) { + bi.ldi = &ldi; ldb_loop_body(bi); } } @@ -2206,6 +2301,9 @@ jit_brgemm_amx_uker_base_t::find_similar( const bd_iteration_t *bdi, bool apply_postops) { auto &tloop = imap_[apply_postops]; const auto cidx = bdi->idx; + // if wary_k_tail is true then last iteration is unique + if (brg.amx_wary_k_tail() && cidx == tloop.bdis.size() - 1) return nullptr; + for (size_t i = (actual_ils(apply_postops) ? 1 : 0); i < cidx; i++) { if (*bdi == tloop.bdis[i] && IMPLICATION(actual_ils(apply_postops), @@ -2253,8 +2351,8 @@ void jit_brgemm_amx_uker_base_t::bdb_loop(brgemm_iteration_t &bi) { mov(ptr[rsp + reg_iter_labels_list_offs_], reg_iter_labels_list); } - for (size_t ibdi = 0; ibdi < tloop.bdis.size(); ibdi++) { - bi.bdi = &(tloop.bdis[ibdi]); + for (auto &bdi : tloop.bdis) { + bi.bdi = &bdi; bdb_loop_body(bi); } if (ununroll_bd_loop) { @@ -2263,8 +2361,8 @@ void jit_brgemm_amx_uker_base_t::bdb_loop(brgemm_iteration_t &bi) { align(64); L(iteration_pointers); - for (size_t ibdi = 0; ibdi < tloop.bdis.size(); ibdi++) { - putL(tloop.bdis[ibdi].lstart); + for (const auto &bdi : tloop.bdis) { + putL(bdi.lstart); } putL(loop_end); L(loop_end); @@ -2326,11 +2424,9 @@ void jit_brgemm_amx_uker_base_t::fill_imap() { auto abdb = bdb + ibdb; if (abdb >= brg.bdb) break; if (brg.bdb_tail && abdb == brg.bdb - 1) - bdi.blocks.emplace_back( - iteration_block_t(bdi_pos, brg.bdb_tail, true)); + bdi.blocks.emplace_back(bdi_pos, brg.bdb_tail, true); else - bdi.blocks.emplace_back( - iteration_block_t(bdi_pos, brg.bd_block, false)); + bdi.blocks.emplace_back(bdi_pos, brg.bd_block, false); bdi_pos += brg.bd_block; if (bdi_pos >= brg.bcast_dim) break; bdi_pos = skipped_bd_mask(bdi_pos); @@ -2371,11 +2467,9 @@ void jit_brgemm_amx_uker_base_t::fill_imap() { auto aldb = ldb + ildb; if (aldb >= brg.ldb) break; if (brg.ldb_tail && aldb == brg.ldb - 1) - ldi.blocks.emplace_back( - iteration_block_t(ldi_pos, brg.ldb_tail, true)); + ldi.blocks.emplace_back(ldi_pos, brg.ldb_tail, true); else - ldi.blocks.emplace_back( - iteration_block_t(ldi_pos, brg.ld_block, false)); + ldi.blocks.emplace_back(ldi_pos, brg.ld_block, false); ldi_pos++; } ldi.idx = tloop.ldis.size(); @@ -2387,15 +2481,14 @@ void jit_brgemm_amx_uker_base_t::fill_imap() { rdi.blocks.reserve(1); for (int rdb = 0; rdb < brg.rdb; rdb++) { rdi.blocks.clear(); - rdi.blocks.emplace_back(iteration_block_t(rdi_pos, brg.rd_block)); + rdi.blocks.emplace_back(rdi_pos, brg.rd_block); rdi.idx = tloop.rdis.size(); tloop.rdis.push_back(rdi); rdi_pos++; } if (brg.rdb_tail > 0) { rdi.blocks.clear(); - rdi.blocks.emplace_back( - 
iteration_block_t(rdi_pos, brg.rdb_tail, true)); + rdi.blocks.emplace_back(rdi_pos, brg.rdb_tail, true); rdi.idx = tloop.rdis.size(); tloop.rdis.push_back(rdi); } @@ -2571,23 +2664,17 @@ void jit_brgemm_amx_uker_base_t::generate() { && brg.brgattr.bd_mask_level == 0; need_to_apply_alpha_beta_ = (brg.beta != 0.f && !may_load_accumulators_) || brg.alpha != 1.f; - const bool has_zero_points = !everyone_is(brgemm_broadcast_t::none, - brg.zp_type_a, brg.zp_type_b, brg.zp_type_c); - are_post_ops_applicable_ = one_of(true, brg.with_eltwise, brg.with_binary, - brg.with_scales, brg.with_bias, brg.with_sum, brg.dt_d != brg.dt_c, - has_zero_points, brg.with_dst_scales); - - // second level blocking eligible only if we don't use store by vectors for now - assert(IMPLICATION(are_post_ops_applicable_ || need_to_apply_alpha_beta_ - || brg.brgattr.bd_mask_level, - !brg.is_blocked && !brg.brgattr.var_bs)); + are_post_ops_applicable_ = brg.are_post_ops_applicable(); + + assert(IMPLICATION(brg.brgattr.LDB2 == 0, brg.load_dim <= brg.LDB)); + assert(IMPLICATION(brg.brgattr.var_bs, IMPLICATION(brg.is_input_convert(), brg.is_fp8_via_convert()))); read_params(); prepare_bd_mask(); Label permute_index_table; - if (brg.is_input_convert()) { + if (brg.is_input_convert() || brg.amx_wary_k_tail()) { // save tiles description for later use brgemm_init_tiles(brg, (char *)(&palette_)); // load permute indices @@ -2669,7 +2756,7 @@ void brgemm_amx_uker_t::operator()(brgemm_kernel_params_t *params) const { (*brgemm_kernel_)(params); } -const jit_generator *brgemm_amx_uker_t::get_jit_generator() const { +const jit_generator_t *brgemm_amx_uker_t::get_jit_generator() const { return brgemm_kernel_; } diff --git a/src/cpu/x64/brgemm/jit_brgemm_kernel.cpp b/src/cpu/x64/brgemm/jit_brgemm_kernel.cpp index 0d26602aafd..d81e3f959db 100644 --- a/src/cpu/x64/brgemm/jit_brgemm_kernel.cpp +++ b/src/cpu/x64/brgemm/jit_brgemm_kernel.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2020-2024 Intel Corporation +* Copyright 2020-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -22,6 +22,7 @@ #include "common/utils.hpp" #include "cpu/platform.hpp" +#include "cpu/x64/brgemm/brgemm.hpp" #include "cpu/x64/brgemm/brgemm_types.hpp" #include "cpu/x64/cpu_barrier.hpp" #include "cpu/x64/injectors/jit_uni_postops_injector.hpp" @@ -40,21 +41,18 @@ namespace x64 { using namespace dnnl::impl::utils; using namespace Xbyak; template -struct jit_brgemm_kernel_t : public jit_generator { +struct jit_brgemm_kernel_t : public jit_base_brgemm_kernel_t { jit_brgemm_kernel_t(const brgemm_desc_t &abrg) - : jit_generator(jit_name(), abrg.isa_impl) + : jit_base_brgemm_kernel_t(jit_name(), abrg.isa_impl) , brg(abrg) , postops_injector_(nullptr) - , max_effective_vregs(isa_num_vregs(brg.isa_impl) - - (brg.is_int8 && !brg.has_int8_vnni - ? 2 - : (brg.is_fp8_via_convert() ? 5 : 0))) { + , max_effective_vregs(get_max_effective_vregs(brg)) { // The implementation uses is_superset(), is_subset() utilities. // So avoid isa_all, isa_undef in these comparisions. assert(!utils::one_of(brg.isa_impl, isa_all, isa_undef)); - const int is_ldb2_tail = brg.ldb2_tail ? 1 : 0; - const int is_ldb_tail = brg.ldb_tail ? 1 : 0; + const dim_t is_ldb2_tail = brg.ldb2_tail ? 1 : 0; + const dim_t is_ldb_tail = brg.ldb_tail ? 
1 : 0; is_ldb_loop_ = brg.ldb2 + is_ldb2_tail + is_ldb_tail > 1; bool has_f8_e5m2_binary_postops = false; @@ -82,15 +80,15 @@ struct jit_brgemm_kernel_t : public jit_generator { // 'fp8_to_f16_upconvert()' param and would collision with these // emulation vmms f8_e5m2_emulator_ = utils::make_unique( - this, xmm_fp8_emu_aux1, xmm_fp8_emu_aux2, - xmm_fp8_emu_aux3, kmask_fp8_aux, reg64_fp8_aux); + this, vmm_fp8_emu_aux1(), vmm_fp8_emu_aux2(), + vmm_fp8_emu_aux3(), kmask_fp8_aux, reg64_fp8_aux); if (one_of(data_type::f8_e4m3, brg.dt_a, brg.dt_b, brg.dt_c, brg.dt_d) || has_f8_e4m3_binary_postops) f8_e4m3_emulator_ = utils::make_unique( - this, xmm_fp8_emu_aux1, xmm_fp8_emu_aux2, - xmm_fp8_emu_aux3, xmm_fp8_emu_aux4, xmm_fp8_emu_aux5, - reg64_fp8_aux); + this, vmm_fp8_emu_aux1(), vmm_fp8_emu_aux2(), + vmm_fp8_emu_aux3(), vmm_fp8_emu_aux4(), + vmm_fp8_emu_aux5(), reg64_fp8_aux); } if (brg.with_eltwise || brg.with_binary || brg.with_sum) { @@ -131,16 +129,18 @@ struct jit_brgemm_kernel_t : public jit_generator { DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_brgemm_kernel_t) - brgemm_desc_t brg; + const brgemm_desc_t &get_brg() const override { return brg; } private: + brgemm_desc_t brg; + enum matrix_kind_t { matrix_A, matrix_B }; static constexpr int zmm_width_in_bytes_ - = cpu_isa_traits::vlen; + = cpu_isa_traits_t::vlen; using Vmm = typename utils::conditional::value, Xbyak::Zmm, Wmm>::type; - using Vmm_lower_t = typename vreg_traits::Vmm_lower_t; + using Vmm_lower_t = typename vreg_traits_t::Vmm_lower_t; using po_injector_t = injector::jit_uni_postops_injector_base_t; std::unique_ptr postops_injector_; std::unique_ptr bf16_emu_; @@ -149,6 +149,8 @@ struct jit_brgemm_kernel_t : public jit_generator { Xbyak::Label avx_tail_mask_; Xbyak::Label sum_zp_scale_data_; + Xbyak::Label f16_perm_even_table_; + Xbyak::Label f16_perm_odd_table_; using reg64_t = const Xbyak::Reg64; // Register decomposition @@ -198,6 +200,13 @@ struct jit_brgemm_kernel_t : public jit_generator { const reg64_t reg_aux_zp_comp_b = reg_rdb_loop; const reg64_t reg_zp_c_values = reg_rdb_loop; const reg64_t reg_aux_zp_c_values = reg_rdb_loop; + const reg64_t reg_wei_scales = reg_rdb_loop; + const reg64_t reg_aux_wei_scales = reg_rdb_loop; + const reg64_t reg_wei_zp = reg_rdb_loop; + const reg64_t reg_aux_wei_zp = reg_rdb_loop; + const reg64_t reg_ic = reg_rdb_loop; + const reg64_t reg_src_scales = reg_rdb_loop; + const reg64_t reg_src_grouped_sum = reg_rdb_loop; const reg64_t reg_tmp_read_values = reg_rdb_loop; const reg64_t reg_aux_scales = reg_aux_B; @@ -262,10 +271,27 @@ struct jit_brgemm_kernel_t : public jit_generator { constexpr static int reg_aux_D_backup_offs_ = 232; constexpr static int reg_aux_D_bdb_loop_backup_offs_ = 240; constexpr static int reg_aux_D_bdb_loop_shift_offs_ = 248; + constexpr static int reg_wei_scales_offs_ = 256; + constexpr static int reg_aux_wei_scales_offs_ = 264; + constexpr static int reg_wei_zero_points_offs_ = 272; + constexpr static int reg_aux_wei_zero_points_offs_ = 280; + constexpr static int reg_ic_offs_ = 288; + constexpr static int reg_aux2_D_offs_ = 296; + constexpr static int reg_aux2_wei_scales_offs_ = 304; + constexpr static int reg_aux2_wei_zero_points_offs_ = 312; + constexpr static int reg_aux_ic_offs_ = 320; + constexpr static int reg_reg_a_offset_offs_ = 328; + constexpr static int reg_src_scales_offs_ = 336; + constexpr static int reg_aux_src_scales_offs_ = 344; + constexpr static int reg_aux2_src_scales_offs_ = 352; + constexpr static int reg_src_grouped_sum_offs_ = 360; + constexpr static 
int reg_aux_src_grouped_sum_offs_ = 368; + constexpr static int reg_aux2_src_grouped_sum_offs_ = 376; // these are used for FP8 as temporary push/pop spaces - constexpr static int reg_val_tmp_1_ = 256; - constexpr static int reg_val_tmp_2_ = 264; - constexpr static int stack_space_needed_ = 272; + constexpr static int reg_val_tmp_1_ = 384; + constexpr static int reg_val_tmp_2_ = 392; + constexpr static int stack_space_needed_ = 400; + bool is_ldb_loop_ = false; bool with_binary_non_scalar_bcast_ = false; @@ -275,14 +301,46 @@ struct jit_brgemm_kernel_t : public jit_generator { Xbyak::Opmask ld_tail_mask = Xbyak::Opmask(3); Xbyak::Opmask fp8_col_mask = Xbyak::Opmask(4); Xbyak::Opmask kmask_fp8_aux = Xbyak::Opmask(5); + Xbyak::Opmask rd_tail_mask = Xbyak::Opmask(6); + + static int get_max_effective_vregs(const brgemm_desc_t &brg) { + auto used_vregs = 0; + if (brg.is_int8 && !brg.has_int8_vnni) + used_vregs = 2; + else if (brg.is_fp8_via_convert()) + used_vregs = 5; + else if (brg.is_f16_b_non_amx_vnni()) + used_vregs = 2; + + if (one_of(brg.dt_b, data_type::nf4) && brg.isa_impl == avx2) { + used_vregs += 5; + } + + if (one_of(brg.dt_b, data_type::f4_e2m1) && brg.isa_impl == avx2) { + used_vregs += 2; + } + + if (one_of(brg.dt_b, data_type::nf4, data_type::f4_e2m1) && brg.isa_impl != avx2) { + used_vregs += 1; + } + + if (brg.with_wei_decomp_zero_points && brg.wei_decomp_zero_points_stride == 0 && !brg.with_src_dyn_quant) { + used_vregs += 1; + } - Vmm accm(int ld_block, int bd, int ld) { + if (brg.with_src_dyn_quant) { + used_vregs += 1; + } + return isa_num_vregs(brg.isa_impl) - used_vregs; + } + + Vmm accm(dim_t ld_block, dim_t bd, dim_t ld) { return Vmm(max_effective_vregs - 1 - (bd * ld_block + ld)); } - Vmm bcst(int bd = 0) { - if (n_bcast_1_load) { - int idx = max_effective_vregs - 1 - (brg.ld_block2 * brg.bd_block) + Vmm bcst(dim_t bd = 0) { + if (brg.n_bcast_1_load) { + dim_t idx = max_effective_vregs - 1 - (brg.ld_block2 * brg.bd_block) - bd; assert(idx > 0); return Vmm(idx); @@ -290,18 +348,18 @@ struct jit_brgemm_kernel_t : public jit_generator { return Vmm(0); } - Vmm load(int ld = 0) { - if (n_bcast_1_load) { + Vmm load(dim_t ld = 0) { + if (brg.n_bcast_1_load) { return Vmm(0); } else { - int idx = max_effective_vregs - 1 - (brg.ld_block2 * brg.bd_block) + dim_t idx = max_effective_vregs - 1 - (brg.ld_block2 * brg.bd_block) - ld; assert(idx > 0); return Vmm(idx); } } - Vmm vmm_tmp(int i) { + Vmm vmm_tmp(dim_t i) { assert(IMPLICATION(!brg.is_tmm, i >= 0 && i < max_effective_vregs @@ -310,6 +368,10 @@ struct jit_brgemm_kernel_t : public jit_generator { } Vmm vmm_tail_mask() { return vmm_tmp(1); } + Vmm vmm_beta() { return vmm_tmp(1); } + Vmm vmm_lbound() { return vmm_tmp(1); } + Vmm vmm_ubound() { return vmm_tmp(0); } + Vmm vmm_one_bytes() const noexcept { return Vmm(3); } Vmm vmm_zp_a_shift() const noexcept { return Vmm(2); } Vmm vmm_inp_shift() const noexcept { return Vmm(1); } @@ -322,11 +384,13 @@ struct jit_brgemm_kernel_t : public jit_generator { // note: zmm reserv_5 is not necessary since it's only used for 'vdpbf16ps' // fp8 emulation convert - Vmm xmm_fp8_emu_aux1 = Vmm(1); - Vmm xmm_fp8_emu_aux2 = Vmm(2); - Vmm xmm_fp8_emu_aux3 = Vmm(3); - Vmm xmm_fp8_emu_aux4 = Vmm(4); - Vmm xmm_fp8_emu_aux5 = Vmm(5); + Vmm vmm_fp8_emu_aux1() const noexcept { return Vmm(1); } + Vmm vmm_fp8_emu_aux2() const noexcept { return Vmm(2); } + Vmm vmm_fp8_emu_aux3() const noexcept { return Vmm(3); } + Vmm vmm_fp8_emu_aux4() const noexcept { return Vmm(4); } + Vmm vmm_fp8_emu_aux5() const 
noexcept { return Vmm(5); } + + Zmm zmm_tmp_1() const noexcept { return Zmm(1); } // Required in every dot product for INT8 non-VNNI computation. Vmm int8_ones_words() const noexcept { @@ -336,6 +400,13 @@ struct jit_brgemm_kernel_t : public jit_generator { return Vmm(isa_num_vregs(brg.isa_impl) - 2); } + Vmm f16_perm_even_vreg() const noexcept { + return Vmm(isa_num_vregs(brg.isa_impl) - 1); + } + Vmm f16_perm_odd_vreg() const noexcept { + return Vmm(isa_num_vregs(brg.isa_impl) - 2); + } + Vmm vmm_mask(const Vmm vmm_in, bool mask_flag, bool store, Xbyak::Opmask ktail_mask) const; Vmm_lower_t vmm_lower_mask(const Vmm_lower_t vmm_lower_in, bool mask_flag, @@ -344,263 +415,287 @@ struct jit_brgemm_kernel_t : public jit_generator { void cvt2ps(data_type_t type_in, const Vmm vmm_in, const Xbyak::Operand &op, bool mask_flag, bool store, Xbyak::Opmask ktail_mask, - int tail_size); + dim_t tail_size); void advance_ldb_post_op_regs(); - void restore_ldb_post_op_regs(int ld_block2); - void advance_bdb_post_op_regs(int adj_bd_block); - void restore_bdb_post_op_regs(int bd_block2); - void ldb_regs_shift(int ld_block2, bool is_tail = false); - void advance_bd_block2_post_op_regs(int bd_block2); + void restore_ldb_post_op_regs(dim_t ld_block2); + void advance_bdb_post_op_regs(dim_t adj_bd_block); + void restore_bdb_post_op_regs(dim_t bd_block2); + void ldb_regs_shift(dim_t ld_block2, bool is_tail = false); + void advance_bd_block2_post_op_regs(dim_t bd_block2); void copy_post_ops_stack_values_to_aux(bool is_reg_tail); void read_params(); - void zero_accumulators(int bd_block2, bool is_bdb_tail, int ld_block, + void zero_accumulators(dim_t bd_block2, bool is_bdb_tail, dim_t ld_block, bool is_ld_tail, bool skip_accumulation); - void fp8_to_f16_upconvert(int num_rows, int tile_num_col_bytes, - reg64_t reg_base, int offset, reg64_t reg_data_stride, + void fp8_to_f16_upconvert(dim_t num_rows, dim_t tile_num_col_bytes, + reg64_t reg_base, dim_t offset, reg64_t reg_data_stride, data_type_t dt, bool is_rd_tail); - void fp8_to_f16_upconvert_to_vnni(int num_rows, int tile_num_col_bytes, - reg64_t reg_base, int offset, reg64_t reg_data_stride, + void fp8_to_f16_upconvert_to_vnni(dim_t num_rows, dim_t tile_num_col_bytes, + reg64_t reg_base, dim_t offset, reg64_t reg_data_stride, data_type_t dt, bool is_rd_tail); - void store_accumulators(int bd_block2, bool is_bdb_tail, int ld_block, + void store_accumulators(dim_t bd_block2, bool is_bdb_tail, dim_t ld_block, bool is_ld_tail, bool skip_accumulation); void store_accumulators_without_post_ops( - int bd_block, int ld_block, bool is_ld_tail); - void store_accumulators_apply_post_ops(int bd_block, int ld_block, - int ldb_and_bdb_offset, bool is_ld_tail); - void apply_compensation(int bd_block, int ld_block, bool is_ld_tail); - void apply_alpha_beta(int bd_block, int ld_block, bool is_ld_tail); - void apply_post_ops(int bd_block, int ld_block2, int ldb_and_bdb_offset, - bool is_ld_tail); + dim_t bd_block, dim_t ld_block, bool is_ld_tail); + void store_accumulators_apply_post_ops(dim_t bd_block, dim_t ld_block, + dim_t ldb_and_bdb_offset, bool is_ld_tail); + void apply_compensation(dim_t bd_block, dim_t ld_block, bool is_ld_tail); + void apply_alpha_beta(dim_t bd_block, dim_t ld_block, bool is_ld_tail); + void apply_post_ops(dim_t bd_block, dim_t ld_block2, + dim_t ldb_and_bdb_offset, bool is_ld_tail); void restore_A_B_matrices(); void set_A_B_matrices(); - void compute_int8_compensation(int rd_loop, int bd_b, int bd_e, - int bd_block, int ld_block2, bool is_ld_tail, int 
vpad); + void compute_int8_compensation(dim_t rd_loop, dim_t bd_b, dim_t bd_e, + dim_t bd_block, dim_t ld_block2, bool is_ld_tail, dim_t vpad); void maybe_pre_process_data(matrix_kind_t matrix_kind, const Tmm &t1, - reg64_t reg_base, size_t offset, reg64_t reg_stride, int num_rows, - int num_col_bytes, bool is_rd_tail); - void maybe_tileloadd_nt(matrix_kind_t matrix_kind, int idx, int offset, - bool is_rd_tail, bool is_tail); + reg64_t reg_base, dim_t offset, reg64_t reg_stride, dim_t num_rows, + dim_t num_col_bytes, bool is_rd_tail); + bool maybe_pre_process_k_tail(bool last_bdb, bool is_rd_tail, const Tmm &t1, + reg64_t reg_base, dim_t offset, reg64_t reg_stride, + matrix_kind_t mk); + void maybe_tileloadd_nt(matrix_kind_t matrix_kind, dim_t idx, dim_t offset, + bool is_rd_tail, bool is_tail, bool last_bdb); void dot_product(Vmm v1, Vmm v2, Vmm v3); - void gemm_microkernel(int bd_block2, bool is_bdb_tail, int ld_block, - bool is_rd_tail, bool is_ld_tail, int vpad, int rows_for_rd_tail); - void gemm_microkernel_amx(int bd_block2, bool is_bdb_tail, int ld_block, - bool is_rd_tail, bool is_ld_tail); - - void ldb_loop(int bd_block2, bool is_bdb_tail, int ld_block, - int ldb_loop_length, bool is_reg_tail, bool is_ld_tail, - bool check_top_vpad, bool check_bottom_vpad, int rows_for_rd_tail, + void gemm_microkernel(dim_t bd_block2, bool is_bdb_tail, dim_t ld_block, + bool is_rd_tail, bool is_ld_tail, dim_t vpad, + dim_t rows_for_rd_tail); + void gemm_microkernel_amx(dim_t bd_block2, bool is_bdb_tail, + dim_t ld_block2, bool is_rd_tail, bool is_ld_tail, bool last_bdb); + void gemm_microkernel_dyn_quant(dim_t bd_block2, bool is_bdb_tail, dim_t ld_block, + bool is_rd_tail, bool is_ld_tail, dim_t vpad, dim_t rows_for_rd_tail); + + void ldb_loop(dim_t bd_block2, bool is_bdb_tail, dim_t ld_block, + dim_t ldb_loop_length, bool is_reg_tail, bool is_ld_tail, + bool first_bdb, bool last_bdb, dim_t rows_for_rd_tail, bool skip_accumulation); void bdb_loop(); void generate() override; - int A_offset(int bd, int rd, bool is_amx = false) const noexcept; - int B_offset(int ld, int rd, bool is_amx = false) const noexcept; - int C_offset(int bd, int ld) const noexcept; - int D_offset(int bd, int ld) const noexcept; - - int rdb_A_offset() const noexcept; - int rdb_B_offset() const noexcept; - - int ldb_B_offset(int ld_block2, bool is_tail = false) const noexcept; - int ldb_C_offset(int ld_block2, bool is_tail = false) const noexcept; - int ldb_D_offset(int ld_block2, bool is_tail = false) const noexcept; - int ldb_po_offset(int ld_block2, bool is_tail = false) const noexcept; - - int bdb_A_offset(int bd_block2) const noexcept; - int bdb_C_offset(int bd_block2) const noexcept; - int bdb_D_offset(int bd_block2) const noexcept; - int bdb_po_offset(int bd_block2) const noexcept; - - int bias_offset(int ld, bool is_tail = false) const noexcept; - int oc_logical_offset(int ld, bool is_tail = false) const noexcept; - - int compensations_offset(int ld, bool is_tail = false) const noexcept; - int bdb_compensation_offset(int bd_block2) const noexcept; - int bd_compensation_offset(int ld, int bd) const noexcept; - int scales_offset(int ld, bool is_tail = false) const noexcept; - int zp_comp_a_offset(int ld, bool is_tail = false) const noexcept; - int bd_zp_comp_a_offset(int ld, int bd) const noexcept; - int bdb_zp_comp_a_offset(int bd_block2) const noexcept; - int zp_comp_b_offset(int bd) const noexcept; - int bdb_zp_comp_b_offset(int bd_block2) const noexcept; - int zp_c_values_offset(int ld, bool is_tail = false) const 
noexcept; - - bool n_bcast_1_load = false; + dim_t A_offset(dim_t bd, dim_t rd, bool is_amx = false) const noexcept; + dim_t B_offset(dim_t ld, dim_t rd, bool is_amx = false) const noexcept; + dim_t C_offset(dim_t bd, dim_t ld) const noexcept; + dim_t D_offset(dim_t bd, dim_t ld) const noexcept; + + dim_t rdb_A_offset() const noexcept; + dim_t rdb_B_offset() const noexcept; + + dim_t ldb_B_offset(dim_t ld_block2, bool is_tail = false) const noexcept; + dim_t ldb_C_offset(dim_t ld_block2, bool is_tail = false) const noexcept; + dim_t ldb_D_offset(dim_t ld_block2, bool is_tail = false) const noexcept; + dim_t ldb_po_offset(dim_t ld_block2, bool is_tail = false) const noexcept; + + dim_t bdb_A_offset(dim_t bd_block2) const noexcept; + dim_t bdb_C_offset(dim_t bd_block2) const noexcept; + dim_t bdb_D_offset(dim_t bd_block2) const noexcept; + dim_t bdb_po_offset(dim_t bd_block2) const noexcept; + + dim_t bias_offset(dim_t ld, bool is_tail = false) const noexcept; + dim_t oc_logical_offset(dim_t ld, bool is_tail = false) const noexcept; + + dim_t compensations_offset(dim_t ld, bool is_tail = false) const noexcept; + dim_t bdb_compensation_offset(dim_t bd_block2) const noexcept; + dim_t bd_compensation_offset(dim_t ld, dim_t bd) const noexcept; + dim_t scales_offset(dim_t ld, bool is_tail = false) const noexcept; + dim_t zp_comp_a_offset(dim_t ld, bool is_tail = false) const noexcept; + dim_t bd_zp_comp_a_offset(dim_t ld, dim_t bd) const noexcept; + dim_t bdb_zp_comp_a_offset(dim_t bd_block2) const noexcept; + dim_t zp_comp_b_offset(dim_t bd) const noexcept; + dim_t bdb_zp_comp_b_offset(dim_t bd_block2) const noexcept; + dim_t zp_c_values_offset(dim_t ld, bool is_tail = false) const noexcept; + dim_t wei_scales_offset(dim_t ld, bool is_tail = false) const noexcept; + dim_t wei_zp_offset(dim_t ld, bool is_tail = false) const noexcept; bool vpad_exist = false; bool need_comp_pads = false; + palette_config_t palette_; }; template -int jit_brgemm_kernel_t::A_offset( - int bd, int rd, bool is_amx) const noexcept { +dim_t jit_brgemm_kernel_t::A_offset( + dim_t bd, dim_t rd, bool is_amx) const noexcept { return (is_amx) ? brg.typesize_A * (bd * brg.bd_block * brg.LDA) : brg.typesize_A * (bd * brg.LDA + rd); } template -int jit_brgemm_kernel_t::B_offset( - int ld, int rd, bool is_amx) const noexcept { +dim_t jit_brgemm_kernel_t::B_offset( + dim_t ld, dim_t rd, bool is_amx) const noexcept { + int typesize_scale = one_of(brg.dt_b, data_type::nf4, data_type::s4, data_type::u4, data_type::f4_e2m1) ? 2 : 1; if (is_amx) { - return brg.typesize_B * (brg.rd_step * ld * brg.ld_block); + return brg.typesize_B * (brg.rd_step * ld * brg.ld_block) / typesize_scale; } else { - const int data_vnni_granularity = brg.ld_step; - const int rdb0 = rd / data_vnni_granularity; + const dim_t rdb0 = rd / brg.ld_step; // Note: Offsets for elements within vnni_granularity are expected to be // handled within gemm_microkernel (for ex: odd-even converts). - // hence no `rd % data_vnni_granularity` + // hence no `rd % brg.ld_step` return brg.typesize_B - * (rdb0 * data_vnni_granularity * brg.LDB - + data_vnni_granularity * ld * brg.ld_block); + * (rdb0 * brg.ld_step * brg.LDB + + brg.ld_step * ld * brg.ld_block) / typesize_scale; } } template -int jit_brgemm_kernel_t::C_offset(int bd, int ld) const noexcept { +dim_t jit_brgemm_kernel_t::C_offset(dim_t bd, dim_t ld) const noexcept { const auto bd_shift = brg.is_runtime_ldc ? 
0 : bd * brg.LDC; return brg.typesize_C * (bd_shift + ld * brg.ld_block); } template -int jit_brgemm_kernel_t::D_offset(int bd, int ld) const noexcept { +dim_t jit_brgemm_kernel_t::D_offset(dim_t bd, dim_t ld) const noexcept { const auto bd_shift = brg.is_runtime_ldd ? 0 : bd * brg.LDD; return brg.typesize_D * (bd_shift + ld * brg.ld_block); } template -int jit_brgemm_kernel_t::rdb_A_offset() const noexcept { +dim_t jit_brgemm_kernel_t::rdb_A_offset() const noexcept { return brg.typesize_A * brg.rd_block; } template -int jit_brgemm_kernel_t::rdb_B_offset() const noexcept { - return brg.typesize_B * brg.rd_block * brg.LDB; +dim_t jit_brgemm_kernel_t::rdb_B_offset() const noexcept { + int typesize_scale = one_of(brg.dt_b, data_type::nf4, data_type::s4, data_type::u4, data_type::f4_e2m1) ? 2 : 1; + return brg.typesize_B * brg.rd_block * brg.LDB / typesize_scale; } template -int jit_brgemm_kernel_t::ldb_B_offset( - int ld_block2, bool is_tail) const noexcept { - return (is_tail) ? brg.typesize_B * brg.ldb_tail * brg.ld_step - : brg.typesize_B * ld_block2 * brg.ld_block * brg.ld_step; +dim_t jit_brgemm_kernel_t::ldb_B_offset( + dim_t ld_block2, bool is_tail) const noexcept { + int typesize_scale = one_of(brg.dt_b, data_type::nf4, data_type::s4, data_type::u4, data_type::f4_e2m1) ? 2 : 1; + return (is_tail) ? brg.typesize_B * brg.ldb_tail * brg.ld_step / typesize_scale + : brg.typesize_B * ld_block2 * brg.ld_block * brg.ld_step / typesize_scale; } template -int jit_brgemm_kernel_t::ldb_C_offset( - int ld_block2, bool is_tail) const noexcept { +dim_t jit_brgemm_kernel_t::ldb_C_offset( + dim_t ld_block2, bool is_tail) const noexcept { return (is_tail) ? brg.typesize_C * brg.ldb_tail : brg.typesize_C * ld_block2 * brg.ld_block; } template -int jit_brgemm_kernel_t::ldb_D_offset( - int ld_block2, bool is_tail) const noexcept { +dim_t jit_brgemm_kernel_t::ldb_D_offset( + dim_t ld_block2, bool is_tail) const noexcept { return (is_tail) ? brg.typesize_D * brg.ldb_tail : brg.typesize_D * ld_block2 * brg.ld_block; } template -int jit_brgemm_kernel_t::ldb_po_offset( - int ld_block2, bool is_tail) const noexcept { +dim_t jit_brgemm_kernel_t::ldb_po_offset( + dim_t ld_block2, bool is_tail) const noexcept { return (is_tail) ? brg.ldb_tail : ld_block2 * brg.ld_block; } template -int jit_brgemm_kernel_t::bdb_A_offset(int bd_block2) const noexcept { +dim_t jit_brgemm_kernel_t::bdb_A_offset(dim_t bd_block2) const noexcept { return brg.typesize_A * bd_block2 * brg.bd_block * brg.LDA; } template -int jit_brgemm_kernel_t::bdb_C_offset(int bd_block2) const noexcept { +dim_t jit_brgemm_kernel_t::bdb_C_offset(dim_t bd_block2) const noexcept { return bd_block2 * brg.bd_block * (brg.is_runtime_ldc ? 1 : brg.typesize_C * brg.LDC); } template -int jit_brgemm_kernel_t::bdb_D_offset(int bd_block2) const noexcept { +dim_t jit_brgemm_kernel_t::bdb_D_offset(dim_t bd_block2) const noexcept { return bd_block2 * brg.bd_block * (brg.is_runtime_ldd ? 1 : brg.typesize_D * brg.LDD); } template -int jit_brgemm_kernel_t::bdb_po_offset(int bd_block2) const noexcept { +dim_t jit_brgemm_kernel_t::bdb_po_offset(dim_t bd_block2) const noexcept { return bd_block2 * brg.bd_block * brg.LDD; } template -int jit_brgemm_kernel_t::bias_offset(int ld, bool is_tail) const noexcept { +dim_t jit_brgemm_kernel_t::bias_offset( + dim_t ld, bool is_tail) const noexcept { return (is_tail) ? 
brg.typesize_bias * brg.ldb_tail : brg.typesize_bias * ld * brg.ld_block; } template -int jit_brgemm_kernel_t::oc_logical_offset( - int ld, bool is_tail) const noexcept { +dim_t jit_brgemm_kernel_t::oc_logical_offset( + dim_t ld, bool is_tail) const noexcept { return (is_tail) ? brg.ldb_tail : ld * brg.ld_block; } template -int jit_brgemm_kernel_t::compensations_offset( - int ld, bool is_tail) const noexcept { +dim_t jit_brgemm_kernel_t::compensations_offset( + dim_t ld, bool is_tail) const noexcept { return (is_tail) ? sizeof(int32_t) * brg.ldb_tail : sizeof(int32_t) * ld * brg.ld_block; } template -int jit_brgemm_kernel_t::bdb_compensation_offset( - int bd_block2) const noexcept { +dim_t jit_brgemm_kernel_t::bdb_compensation_offset( + dim_t bd_block2) const noexcept { return sizeof(int32_t) * bd_block2 * brg.bd_block * brg.LDB; } template -int jit_brgemm_kernel_t::bd_compensation_offset( - int ld, int bd) const noexcept { +dim_t jit_brgemm_kernel_t::bd_compensation_offset( + dim_t ld, dim_t bd) const noexcept { return sizeof(int32_t) * (ld * brg.ld_block + bd * brg.LDB); } template -int jit_brgemm_kernel_t::scales_offset( - int ld, bool is_tail) const noexcept { +dim_t jit_brgemm_kernel_t::scales_offset( + dim_t ld, bool is_tail) const noexcept { return (is_tail) ? brg.is_oc_scale * sizeof(float) * brg.ldb_tail : brg.is_oc_scale * sizeof(float) * ld * brg.ld_block; } template -int jit_brgemm_kernel_t::zp_comp_a_offset( - int ld, bool is_tail) const noexcept { +dim_t jit_brgemm_kernel_t::zp_comp_a_offset( + dim_t ld, bool is_tail) const noexcept { return (is_tail) ? sizeof(int32_t) * brg.ldb_tail : sizeof(int32_t) * ld * brg.ld_block; } template -int jit_brgemm_kernel_t::bdb_zp_comp_a_offset( - int bd_block2) const noexcept { +dim_t jit_brgemm_kernel_t::wei_scales_offset( + dim_t ld, bool is_tail) const noexcept { + return (is_tail) ? types::data_type_size(brg.wei_decomp_scales_dt) * brg.ldb_tail + : types::data_type_size(brg.wei_decomp_scales_dt) * ld * brg.ld_block; +} + +template +dim_t jit_brgemm_kernel_t::wei_zp_offset( + dim_t ld, bool is_tail) const noexcept { + return (is_tail) ? types::data_type_size(brg.wei_decomp_zero_points_dt) * brg.ldb_tail + : types::data_type_size(brg.wei_decomp_zero_points_dt) * ld * brg.ld_block; +} + +template +dim_t jit_brgemm_kernel_t::bdb_zp_comp_a_offset( + dim_t bd_block2) const noexcept { return sizeof(int32_t) * bd_block2 * brg.bd_block * brg.LDB; } template -int jit_brgemm_kernel_t::bd_zp_comp_a_offset( - int ld, int bd) const noexcept { +dim_t jit_brgemm_kernel_t::bd_zp_comp_a_offset( + dim_t ld, dim_t bd) const noexcept { return sizeof(int32_t) * (ld * brg.ld_block + bd * brg.LDB); } template -int jit_brgemm_kernel_t::zp_comp_b_offset(int bd) const noexcept { +dim_t jit_brgemm_kernel_t::zp_comp_b_offset(dim_t bd) const noexcept { return sizeof(int32_t) * bd; } template -int jit_brgemm_kernel_t::bdb_zp_comp_b_offset( - int bd_block2) const noexcept { +dim_t jit_brgemm_kernel_t::bdb_zp_comp_b_offset( + dim_t bd_block2) const noexcept { return zp_comp_b_offset(bd_block2 * brg.bd_block); } template -int jit_brgemm_kernel_t::zp_c_values_offset( - int ld, bool is_tail) const noexcept { +dim_t jit_brgemm_kernel_t::zp_c_values_offset( + dim_t ld, bool is_tail) const noexcept { if (brg.zp_type_c == brgemm_broadcast_t::per_n) { return (is_tail) ? 
sizeof(int32_t) * brg.ldb_tail : sizeof(int32_t) * ld * brg.ld_block; @@ -636,10 +731,10 @@ void jit_brgemm_kernel_t::maybe_set_avx_mask(bool is_ld_tail) { template void jit_brgemm_kernel_t::cvt2ps(data_type_t type_in, const Vmm vmm_in, const Xbyak::Operand &op, bool mask_flag, bool store, - Xbyak::Opmask ktail_mask, int tail_size) { + Xbyak::Opmask ktail_mask, dim_t tail_size) { Vmm vmm = vmm_in; - const bool has_tail - = op.isMEM() && tail_size != vreg_traits::vlen / sizeof(float); + const bool has_tail = op.isMEM() + && tail_size != vreg_traits_t::vlen / sizeof(float); if (IMPLICATION(has_tail, is_superset(brg.isa_impl, avx512_core))) { vmm = vmm_mask(vmm_in, mask_flag, store, ktail_mask); } else { @@ -700,7 +795,7 @@ void jit_brgemm_kernel_t::advance_ldb_post_op_regs() { } template -void jit_brgemm_kernel_t::restore_ldb_post_op_regs(int ld_block2) { +void jit_brgemm_kernel_t::restore_ldb_post_op_regs(dim_t ld_block2) { if (brg.with_bias) { mov(reg_aux_bias, ptr[rsp + reg_aux_bias_offs_]); sub(reg_aux_bias, bias_offset(ld_block2 - 1)); @@ -724,7 +819,7 @@ void jit_brgemm_kernel_t::restore_ldb_post_op_regs(int ld_block2) { } template -void jit_brgemm_kernel_t::advance_bdb_post_op_regs(int adj_bd_block) { +void jit_brgemm_kernel_t::advance_bdb_post_op_regs(dim_t adj_bd_block) { if (brg.zp_type_b != brgemm_broadcast_t::none) { mov(reg_aux_zp_comp_b, ptr[rsp + reg_aux_zp_comp_b_offs_]); add(reg_aux_zp_comp_b, bdb_zp_comp_b_offset(1)); @@ -739,7 +834,7 @@ void jit_brgemm_kernel_t::advance_bdb_post_op_regs(int adj_bd_block) { } template -void jit_brgemm_kernel_t::restore_bdb_post_op_regs(int bd_block2) { +void jit_brgemm_kernel_t::restore_bdb_post_op_regs(dim_t bd_block2) { bool post_processed = false; if (bd_block2 > 1) { if (brg.zp_type_b != brgemm_broadcast_t::none) { @@ -759,14 +854,16 @@ void jit_brgemm_kernel_t::restore_bdb_post_op_regs(int bd_block2) { } template -void jit_brgemm_kernel_t::ldb_regs_shift(int ld_block2, bool is_tail) { - int C_offset = (is_tail) ? ldb_C_offset(1, true) : ldb_C_offset(ld_block2); - int D_offset = (is_tail) ? ldb_D_offset(1, true) : ldb_D_offset(ld_block2); +void jit_brgemm_kernel_t::ldb_regs_shift(dim_t ld_block2, bool is_tail) { + dim_t C_offset + = (is_tail) ? ldb_C_offset(1, true) : ldb_C_offset(ld_block2); + dim_t D_offset + = (is_tail) ? ldb_D_offset(1, true) : ldb_D_offset(ld_block2); add(reg_aux_C, C_offset); add(reg_aux_D, D_offset); add(reg_b_offset, - (is_tail) ? ldb_B_offset(1, true) : ldb_B_offset(ld_block2)); + (is_tail) ? ldb_B_offset(0, true) : ldb_B_offset(ld_block2)); if (brg.with_bias) { mov(reg_aux_bias, ptr[rsp + reg_aux_bias_offs_]); @@ -787,6 +884,23 @@ void jit_brgemm_kernel_t::ldb_regs_shift(int ld_block2, bool is_tail) { (is_tail) ? scales_offset(1, true) : scales_offset(ld_block2)); mov(ptr[rsp + reg_aux_scales_offs_], reg_aux_scales); } + + if (brg.with_wei_decomp) { + if (brg.with_wei_decomp_scales && brg.wei_decomp_scales_stride != 0 ) { + mov(reg_aux_wei_scales, ptr[rsp + reg_aux_wei_scales_offs_]); + add(reg_aux_wei_scales, (is_tail) ? wei_scales_offset(1, true) : wei_scales_offset(ld_block2)); + mov(ptr[rsp + reg_aux_wei_scales_offs_], reg_aux_wei_scales); + mov(ptr[rsp + reg_aux2_wei_scales_offs_], reg_aux_wei_scales); + } + + if (brg.with_wei_decomp_zero_points && brg.wei_decomp_zero_points_stride != 0 ) { + mov(reg_aux_wei_zp, ptr[rsp + reg_aux_wei_zero_points_offs_]); + add(reg_aux_wei_zp, (is_tail) ? 
wei_zp_offset(1, true) : wei_zp_offset(ld_block2)); + mov(ptr[rsp + reg_aux_wei_zero_points_offs_], reg_aux_wei_zp); + mov(ptr[rsp + reg_aux2_wei_zero_points_offs_], reg_aux_wei_zp); + } + } + if (brg.zp_type_a != brgemm_broadcast_t::none) { mov(reg_aux_zp_comp_a, ptr[rsp + reg_aux_zp_comp_a_offs_]); add(reg_aux_zp_comp_a, @@ -804,7 +918,7 @@ void jit_brgemm_kernel_t::ldb_regs_shift(int ld_block2, bool is_tail) { } template -void jit_brgemm_kernel_t::advance_bd_block2_post_op_regs(int bd_block2) { +void jit_brgemm_kernel_t::advance_bd_block2_post_op_regs(dim_t bd_block2) { if (brg.req_comp_pads_with_bcast && brg.req_s8s8_compensation) { mov(reg_compensation, ptr[rsp + reg_comp_offs_]); add(reg_compensation, bdb_compensation_offset(bd_block2)); @@ -854,6 +968,29 @@ void jit_brgemm_kernel_t::copy_post_ops_stack_values_to_aux( mov(reg_zp_c_values, ptr[rsp + reg_zp_c_values_offs_]); mov(ptr[rsp + reg_aux_zp_c_values_offs_], reg_zp_c_values); } + + if (brg.with_wei_decomp_scales) { + mov(reg_wei_scales, ptr[rsp + reg_wei_scales_offs_]); + mov(ptr[rsp + reg_aux_wei_scales_offs_], reg_wei_scales); + mov(ptr[rsp + reg_aux2_wei_scales_offs_], reg_wei_scales); + } + if (brg.with_wei_decomp_zero_points) { + mov(reg_wei_zp, ptr[rsp + reg_wei_zero_points_offs_]); + mov(ptr[rsp + reg_aux_wei_zero_points_offs_], reg_wei_zp); + mov(ptr[rsp + reg_aux2_wei_zero_points_offs_], reg_wei_zp); + } + + } + if (brg.with_src_dyn_quant) { + mov(reg_src_scales, ptr[rsp + reg_src_scales_offs_]); + mov(ptr[rsp + reg_aux_src_scales_offs_], reg_src_scales); + mov(ptr[rsp + reg_aux2_src_scales_offs_], reg_src_scales); + + if (brg.with_wei_decomp_zero_points) { + mov(reg_src_grouped_sum, ptr[rsp + reg_src_grouped_sum_offs_]); + mov(ptr[rsp + reg_aux_src_grouped_sum_offs_], reg_src_grouped_sum); + mov(ptr[rsp + reg_aux2_src_grouped_sum_offs_], reg_src_grouped_sum); + } } if (brg.zp_type_b != brgemm_broadcast_t::none) { mov(reg_zp_comp_b, ptr[rsp + reg_zp_comp_b_offs_]); @@ -917,6 +1054,25 @@ void jit_brgemm_kernel_t::read_params() { mov(ptr[rsp + reg_zp_comp_b_offs_], reg_zp_comp_b); } + if (brg.with_wei_decomp) { + mov(reg_wei_scales, ptr[param1 + GET_OFF(ptr_wei_scales)]); + mov(ptr[rsp + reg_wei_scales_offs_], reg_wei_scales); + + mov(reg_wei_zp, ptr[param1 + GET_OFF(ptr_wei_zero_points)]); + mov(ptr[rsp + reg_wei_zero_points_offs_], reg_wei_zp); + + mov(reg_ic, ptr[param1 + GET_OFF(ic)]); + mov(ptr[rsp + reg_ic_offs_], reg_ic); + } + + if (brg.with_src_dyn_quant) { + mov(reg_src_scales, ptr[param1 + GET_OFF(ptr_src_scales)]); + mov(ptr[rsp + reg_src_scales_offs_], reg_src_scales); + + mov(reg_src_grouped_sum, ptr[param1 + GET_OFF(ptr_src_grouped_sum)]); + mov(ptr[rsp + reg_src_grouped_sum_offs_], reg_src_grouped_sum); + } + if (brg.zp_type_c != brgemm_broadcast_t::none) { mov(reg_zp_c_values, ptr[param1 + GET_OFF(c_zp_values)]); mov(ptr[rsp + reg_zp_c_values_offs_], reg_zp_c_values); @@ -953,21 +1109,21 @@ void jit_brgemm_kernel_t::read_params() { } template -void jit_brgemm_kernel_t::zero_accumulators(int bd_block2, - bool is_bdb_tail, int ld_block2, bool is_ld_tail, +void jit_brgemm_kernel_t::zero_accumulators(dim_t bd_block2, + bool is_bdb_tail, dim_t ld_block2, bool is_ld_tail, bool skip_accumulation) { if (brg.is_tmm) { // avoid usage of tile registers if there is no accumulation if (skip_accumulation) return; - for_(int bdb = 0; bdb < bd_block2; bdb++) - for (int ldb = 0; ldb < ld_block2; ldb++) { - int idx = (is_ld_tail) ? 
brg.ld_block2 : ldb; + for_(dim_t bdb = 0; bdb < bd_block2; bdb++) + for (dim_t ldb = 0; ldb < ld_block2; ldb++) { + dim_t idx = (is_ld_tail) ? brg.ld_block2 : ldb; tilezero(Tmm(brg.get_C_tensor(bdb, idx, is_bdb_tail, is_ld_tail))); } } else { - int bd_block = (is_bdb_tail) ? brg.bdb_tail : brg.bd_block; - for_(int bd = 0; bd < bd_block; bd++) - for (int ld = 0; ld < ld_block2; ld++) { + dim_t bd_block = (is_bdb_tail) ? brg.bdb_tail : brg.bd_block; + for_(dim_t bd = 0; bd < bd_block; bd++) + for (dim_t ld = 0; ld < ld_block2; ld++) { auto vmm = accm(ld_block2, bd, ld); uni_vpxor(vmm, vmm, vmm); } @@ -977,30 +1133,30 @@ void jit_brgemm_kernel_t::zero_accumulators(int bd_block2, // This method up-converts the data from bf8 to f16 and saves at reg_buf. // Generally used by matrix_A, where no vnni transformation of data is needed. template -void jit_brgemm_kernel_t::fp8_to_f16_upconvert(int num_rows, - int tile_num_col_bytes, reg64_t reg_base, int offset, +void jit_brgemm_kernel_t::fp8_to_f16_upconvert(dim_t num_rows, + dim_t tile_num_col_bytes, reg64_t reg_base, dim_t offset, reg64_t reg_data_stride, data_type_t dt, bool is_rd_tail) { - int rd_block = is_rd_tail ? brg.rdb_tail : brg.rd_block; + dim_t rd_block = is_rd_tail ? brg.rdb_tail : brg.rd_block; - const int max_num_cols = rd_block; //tile_num_col_bytes / sizeof(float16_t); - const int col_tail = max_num_cols % 32; + const dim_t max_num_cols + = rd_block; //tile_num_col_bytes / sizeof(float16_t); + const dim_t col_tail = max_num_cols % 32; auto zmm_1 = vmm_tmp(0); auto zmm_1_masked = col_tail ? zmm_1 | fp8_col_mask | T_z : zmm_1; assert(max_num_cols > 0); if (col_tail) { - const int tail_mask = (1 << col_tail) - 1; - auto reg_tmp_32 = reg_tmp_gpr.cvt32(); - mov(reg_tmp_32, tail_mask); - kmovd(fp8_col_mask, reg_tmp_32); + const auto tail_mask = (static_cast(1) << col_tail) - 1; + mov(reg_tmp_gpr, tail_mask); + kmovq(fp8_col_mask, reg_tmp_gpr); } // Note: using the same register used in col_tail, so order is important const auto reg_data_aux = reg_tmp_gpr; lea(reg_data_aux, ptr[reg_base + offset]); - for (int r = 0; r < num_rows; ++r) { + for (dim_t r = 0; r < num_rows; ++r) { if (dt == data_type::f8_e5m2) f8_e5m2_emulator_->vcvt_f8_to_f16(zmm_1_masked, ptr[reg_data_aux]); else if (dt == data_type::f8_e4m3) @@ -1016,11 +1172,11 @@ void jit_brgemm_kernel_t::fp8_to_f16_upconvert(int num_rows, // This method up-converts and transforms the data from fp8_vnni to f16_vnni // format. Generally used by matrix_B. template -void jit_brgemm_kernel_t::fp8_to_f16_upconvert_to_vnni(int num_rows, - int tile_num_col_bytes, reg64_t reg_base, int offset, +void jit_brgemm_kernel_t::fp8_to_f16_upconvert_to_vnni(dim_t num_rows, + dim_t tile_num_col_bytes, reg64_t reg_base, dim_t offset, reg64_t reg_data_stride, data_type_t dt, bool is_rd_tail) { - const int num_cols_ele = tile_num_col_bytes / 2; // 32 for full tile - const int num_N = num_cols_ele / 2; // 16 for full tile + const dim_t num_cols_ele = tile_num_col_bytes / 2; // 32 for full tile + const dim_t num_N = num_cols_ele / 2; // 16 for full tile const auto zmm_2 = vmm_tmp(2); assert(num_N > 0 && "bad tile parameters"); @@ -1029,9 +1185,9 @@ void jit_brgemm_kernel_t::fp8_to_f16_upconvert_to_vnni(int num_rows, const auto reg_data_aux = reg_tmp_gpr; lea(reg_data_aux, ptr[reg_base + offset]); - int rd_block = is_rd_tail ? 
brg.rdb_tail : brg.rd_block; - const int vnni_granularity = data_type_vnni_granularity(data_type::f16); - const int r_end = utils::div_up(rd_block, vnni_granularity); + dim_t rd_block = is_rd_tail ? brg.rdb_tail : brg.rd_block; + const dim_t vnni_granularity = data_type_vnni_granularity(data_type::f16); + const dim_t r_end = utils::div_up(rd_block, vnni_granularity); assert(r_end <= num_rows && "bad tile parameters"); if (dt == data_type::f8_e5m2) @@ -1046,16 +1202,16 @@ void jit_brgemm_kernel_t::fp8_to_f16_upconvert_to_vnni(int num_rows, // zero rest of the tile data if (r_end < num_rows) { vpxord(zmm_2, zmm_2, zmm_2); - for (int r = r_end; r < num_rows; ++r) + for (dim_t r = r_end; r < num_rows; ++r) vmovups(ptr[reg_buf_aux + r * zmm_width_in_bytes_], zmm_2); } } template void jit_brgemm_kernel_t::apply_alpha_beta( - int bd_block, int ld_block2, bool is_ld_tail) { + dim_t bd_block, dim_t ld_block2, bool is_ld_tail) { const bool apply_alpha = brg.alpha != 1.f; - const bool dq2ps_required = brg.is_int8 && (apply_alpha || brg.beta != 1.f); + const bool dq2ps_required = brg.is_int8 && (apply_alpha || brg.beta != 1.f) && !brg.with_src_dyn_quant; auto vmm_alpha = vmm_tmp(0); if (apply_alpha) { @@ -1063,8 +1219,8 @@ void jit_brgemm_kernel_t::apply_alpha_beta( uni_vmovq(Xmm(vmm_alpha.getIdx()), reg_tmp_gpr); uni_vbroadcastss(vmm_alpha, Xmm(vmm_alpha.getIdx())); } - for_(int bd = 0; bd < bd_block; bd++) - for (int ld = 0; ld < ld_block2; ld++) { + for_(dim_t bd = 0; bd < bd_block; bd++) + for (dim_t ld = 0; ld < ld_block2; ld++) { auto vmm = accm(ld_block2, bd, ld); if (dq2ps_required) uni_vcvtdq2ps(vmm, vmm); if (apply_alpha) uni_vmulps(vmm, vmm, vmm_alpha); @@ -1074,11 +1230,10 @@ void jit_brgemm_kernel_t::apply_alpha_beta( const bool use_vadd_for_beta = brg.beta == 1.f && !dq2ps_required; const bool need_init_beta_vmm = brg.beta != 1.f; auto vmm_prev_dst = vmm_tmp(0); - auto vmm_beta = vmm_tail_mask(); if (need_init_beta_vmm) { mov(reg_tmp_gpr, float2int(static_cast(brg.beta))); - uni_vmovq(Xmm(vmm_beta.getIdx()), reg_tmp_gpr); - uni_vbroadcastss(vmm_beta, Xmm(vmm_beta.getIdx())); + uni_vmovq(Xmm(vmm_beta().getIdx()), reg_tmp_gpr); + uni_vbroadcastss(vmm_beta(), Xmm(vmm_beta().getIdx())); } if (brg.is_runtime_ldc && bd_block > 1) @@ -1086,8 +1241,8 @@ void jit_brgemm_kernel_t::apply_alpha_beta( if (brg.is_fp8_via_convert()) mov(ptr[rsp + reg_val_tmp_1_], reg64_fp8_aux); - for_(int bd = 0; bd < bd_block; bd++) - for (int ld = 0; ld < ld_block2; ld++) { + for_(dim_t bd = 0; bd < bd_block; bd++) + for (dim_t ld = 0; ld < ld_block2; ld++) { const bool is_tail = is_ld_tail && ld + 1 == ld_block2; const auto k_mask = is_tail ? ld_tail_mask : ld_full_mask; auto vmm = accm(ld_block2, bd, ld); @@ -1095,25 +1250,25 @@ void jit_brgemm_kernel_t::apply_alpha_beta( if (use_vadd_for_beta) { if (IMPLICATION(is_tail, is_superset(brg.isa_impl, avx512_core))) { auto vmm_masked = vmm_mask(vmm, is_tail, false, k_mask); - if (brg.is_int8) + if (brg.is_int8 && !brg.with_src_dyn_quant) uni_vpaddd(vmm_masked, vmm, ptr_C); else uni_vaddps(vmm_masked, vmm, ptr_C); } else { vmaskmovps(vmm_prev_dst, vmm_tail_mask(), ptr_C); - if (brg.is_int8) + if (brg.is_int8 && !brg.with_src_dyn_quant) uni_vpaddd(vmm, vmm, vmm_prev_dst); else uni_vaddps(vmm, vmm, vmm_prev_dst); } } else { - const int ld_size = is_tail ? brg.ldb_tail : brg.ld_block; + const dim_t ld_size = is_tail ? 
brg.ldb_tail : brg.ld_block; cvt2ps(brg.dt_c, vmm_prev_dst, ptr_C, is_tail, false, k_mask, ld_size); if (brg.beta == 1.f) uni_vaddps(vmm, vmm, vmm_prev_dst); else - uni_vfmadd231ps(vmm, vmm_prev_dst, vmm_beta); + uni_vfmadd231ps(vmm, vmm_prev_dst, vmm_beta()); } if (brg.is_runtime_ldc && bd_block > 1 && ld == ld_block2 - 1) add(reg_aux_C, ptr[rsp + reg_C_shift_bytes_offs_]); @@ -1128,8 +1283,8 @@ void jit_brgemm_kernel_t::apply_alpha_beta( } template -void jit_brgemm_kernel_t::apply_post_ops( - int bd_block, int ld_block2, int ldb_and_bdb_offset, bool is_ld_tail) { +void jit_brgemm_kernel_t::apply_post_ops(dim_t bd_block, dim_t ld_block2, + dim_t ldb_and_bdb_offset, bool is_ld_tail) { binary_injector::rhs_arg_dynamic_params_t rhs_arg_params; @@ -1143,16 +1298,16 @@ void jit_brgemm_kernel_t::apply_post_ops( if (brg.is_runtime_ldd && bd_block > 1) mov(ptr[rsp + reg_aux_D_backup_offs_], reg_aux_D); - const int bd_block_shift = brg.is_runtime_ldd ? 1 : bd_block; - for (int bd_block_idx = 0; bd_block_idx < bd_block; + const dim_t bd_block_shift = brg.is_runtime_ldd ? 1 : bd_block; + for (dim_t bd_block_idx = 0; bd_block_idx < bd_block; bd_block_idx += bd_block_shift) { - int bd_start = bd_block_idx; - int bd_end = bd_start + bd_block_shift; + dim_t bd_start = bd_block_idx; + dim_t bd_end = bd_start + bd_block_shift; const auto set_binary_injecotr_params = [&] { if (!brg.with_binary || !with_binary_non_scalar_bcast_) return; - for_(int bd = bd_start; bd < bd_end; bd++) - for (int ld = 0; ld < ld_block2; ld++) { + for_(dim_t bd = bd_start; bd < bd_end; bd++) + for (dim_t ld = 0; ld < ld_block2; ld++) { const auto vmm_idx = accm(ld_block2, bd, ld).getIdx(); rhs_arg_params.vmm_idx_to_out_reg.emplace(vmm_idx, reg_aux_D); @@ -1205,14 +1360,14 @@ void jit_brgemm_kernel_t::apply_post_ops( // objects above that use push/pop if (brg.is_fp8_via_convert()) push(reg64_fp8_aux); - for_(int bd = bd_start; bd < bd_end; bd++) - for (int ld = 0; ld < ld_block2; ld++) { + for_(dim_t bd = bd_start; bd < bd_end; bd++) + for (dim_t ld = 0; ld < ld_block2; ld++) { const auto vmm = accm(ld_block2, bd, ld); const auto addr = ptr[reg_aux_D + D_offset(bd, ld)]; const auto vmm_prev_dst = vmm_tmp(0); const bool is_tail = is_ld_tail && ld + 1 == ld_block2; const auto k_mask = is_tail ? ld_tail_mask : ld_full_mask; - const int ld_size = is_tail ? brg.ldb_tail : brg.ld_block; + const dim_t ld_size = is_tail ? brg.ldb_tail : brg.ld_block; cvt2ps(brg.sum_dt, vmm_prev_dst, addr, is_tail, false, k_mask, ld_size); if (p_sum_zp_reg_set) @@ -1253,8 +1408,8 @@ void jit_brgemm_kernel_t::apply_post_ops( } template -void jit_brgemm_kernel_t::store_accumulators_apply_post_ops( - int bd_block, int ld_block2, int ldb_and_bdb_offset, bool is_ld_tail) { +void jit_brgemm_kernel_t::store_accumulators_apply_post_ops(dim_t bd_block, + dim_t ld_block2, dim_t ldb_and_bdb_offset, bool is_ld_tail) { auto k_mask = (!is_ld_tail) ? 
ld_full_mask : ld_tail_mask; // if (brg.is_int8 && alpha_or_beta_applicable && !beta_uses_vadd) -> @@ -1263,11 +1418,12 @@ void jit_brgemm_kernel_t::store_accumulators_apply_post_ops( const bool beta_uses_vadd = brg.beta == 1.f && IMPLICATION(brg.is_int8, brg.alpha == 1.0f); const bool dq2ps_required = brg.is_int8 - && IMPLICATION(alpha_or_beta_applicable, beta_uses_vadd); + && IMPLICATION(alpha_or_beta_applicable, beta_uses_vadd) + && !brg.with_src_dyn_quant; if (brg.with_scales) { mov(reg_aux_scales, ptr[rsp + reg_aux_scales_offs_]); - for (int ld = 0; ld < ld_block2; ld++) { + for (dim_t ld = 0; ld < ld_block2; ld++) { const auto addr = ptr[reg_aux_scales + scales_offset(ld)]; const bool is_tail = is_ld_tail && ld + 1 == ld_block2; auto vmm_scales = vmm_tmp(0); @@ -1279,7 +1435,7 @@ void jit_brgemm_kernel_t::store_accumulators_apply_post_ops( auto vmm_scales = vmm_tmp(0); vmaskmovps(vmm_scales, vmm_tail_mask(), addr); } - for (int bd = 0; bd < bd_block; bd++) { + for (dim_t bd = 0; bd < bd_block; bd++) { auto vmm = accm(ld_block2, bd, ld); if (dq2ps_required) uni_vcvtdq2ps(vmm, vmm); uni_vmulps(vmm, vmm, vmm_scales); @@ -1290,7 +1446,7 @@ void jit_brgemm_kernel_t::store_accumulators_apply_post_ops( if (brg.with_bias) { mov(reg_aux_bias, ptr[rsp + reg_aux_bias_offs_]); } if (brg.is_fp8_via_convert()) mov(ptr[rsp + reg_val_tmp_1_], reg64_fp8_aux); - for (int ld = 0; ld < ld_block2; ld++) { + for (dim_t ld = 0; ld < ld_block2; ld++) { auto vmm_bias = vmm_tmp(0); if (brg.with_bias) { auto ptr_bias = ptr[reg_aux_bias + bias_offset(ld)]; @@ -1298,7 +1454,7 @@ void jit_brgemm_kernel_t::store_accumulators_apply_post_ops( cvt2ps(brg.dt_bias, vmm_bias, ptr_bias, is_tail, false, k_mask, is_tail ? brg.ldb_tail : brg.ld_block); } - for (int bd = 0; bd < bd_block; bd++) { + for (dim_t bd = 0; bd < bd_block; bd++) { auto vmm = accm(ld_block2, bd, ld); if (dq2ps_required && !brg.with_scales) uni_vcvtdq2ps(vmm, vmm); if (brg.with_bias) uni_vaddps(vmm, vmm, vmm_bias); @@ -1314,8 +1470,8 @@ void jit_brgemm_kernel_t::store_accumulators_apply_post_ops( auto vmm_dst_scales = vmm_tmp(0); vbroadcastss(vmm_dst_scales, ptr[reg_aux_dst_scales]); - for (int ld = 0; ld < ld_block2; ld++) { - for (int bd = 0; bd < bd_block; bd++) { + for (dim_t ld = 0; ld < ld_block2; ld++) { + for (dim_t bd = 0; bd < bd_block; bd++) { auto vmm = accm(ld_block2, bd, ld); vmulps(vmm, vmm, vmm_dst_scales); } @@ -1337,10 +1493,10 @@ void jit_brgemm_kernel_t::store_accumulators_apply_post_ops( if (brg.is_fp8_via_convert()) mov(ptr[rsp + reg_val_tmp_1_], reg64_fp8_aux); - for (int ld = 0; ld < ld_block2; ld++) { + for (dim_t ld = 0; ld < ld_block2; ld++) { const bool is_tail = is_ld_tail && ld + 1 == ld_block2; if (brg.zp_type_c == brgemm_broadcast_t::per_n) { - int zp_c_off = zp_c_values_offset(ld); + dim_t zp_c_off = zp_c_values_offset(ld); if (is_superset(brg.isa_impl, avx512_core)) { auto zp_c_addr = EVEX_compress_addr(reg_aux_zp_c_values, zp_c_off); @@ -1352,7 +1508,7 @@ void jit_brgemm_kernel_t::store_accumulators_apply_post_ops( k_mask, is_tail ? 
brg.ldb_tail : brg.ld_block); } } - for (int bd = 0; bd < bd_block; bd++) { + for (dim_t bd = 0; bd < bd_block; bd++) { auto vmm = accm(ld_block2, bd, ld); uni_vaddps(vmm, vmm, vmm_zp_c); } @@ -1363,16 +1519,14 @@ void jit_brgemm_kernel_t::store_accumulators_apply_post_ops( const bool dt_requires_saturation = one_of(brg.dt_d, data_type::u8, data_type::s8, data_type::s32); - auto vmm_lbound = vmm_tail_mask(); - auto vmm_ubound = vmm_tmp(0); - assert(vmm_lbound.getIdx() != vmm_ubound.getIdx()); + assert(vmm_lbound().getIdx() != vmm_ubound().getIdx()); if (dt_requires_saturation) { - init_saturate_f32( - vmm_lbound, vmm_ubound, reg_tmp_gpr, data_type::f32, brg.dt_d); - for (int bd = 0; bd < bd_block; bd++) { - for (int ld = 0; ld < ld_block2; ld++) { + init_saturate_f32(vmm_lbound(), vmm_ubound(), reg_tmp_gpr, + data_type::f32, brg.dt_d); + for (dim_t bd = 0; bd < bd_block; bd++) { + for (dim_t ld = 0; ld < ld_block2; ld++) { auto vmm = accm(ld_block2, bd, ld); - saturate_cvt_f32(vmm, vmm_lbound, vmm_ubound, brg.dt_d); + saturate_cvt_f32(vmm, vmm_lbound(), vmm_ubound(), brg.dt_d); } } // below call is not required as s32 doesn't use vmm_lbound @@ -1385,8 +1539,8 @@ void jit_brgemm_kernel_t::store_accumulators_apply_post_ops( mov(ptr[rsp + reg_aux_D_backup_offs_], reg_aux_D); if (brg.is_fp8_via_convert()) mov(ptr[rsp + reg_val_tmp_1_], reg64_fp8_aux); - for_(int bd = 0; bd < bd_block; bd++) - for (int ld = 0; ld < ld_block2; ld++) { + for_(dim_t bd = 0; bd < bd_block; bd++) + for (dim_t ld = 0; ld < ld_block2; ld++) { auto addr = ptr[reg_aux_D + D_offset(bd, ld)]; auto vmm = accm(ld_block2, bd, ld); auto vmm_lower = Vmm_lower_t(vmm.getIdx()); @@ -1431,7 +1585,7 @@ void jit_brgemm_kernel_t::store_accumulators_apply_post_ops( default: assert(!"unknown dst_dt"); } } else { - const int ld_block = is_tail ? brg.ldb_tail : brg.ld_block; + const dim_t ld_block = is_tail ? brg.ldb_tail : brg.ld_block; if (is_tail && types::data_type_size(brg.dt_b) == sizeof(float)) vmaskmovps(addr, vmm_tail_mask(), vmm); else @@ -1449,7 +1603,7 @@ void jit_brgemm_kernel_t::store_accumulators_apply_post_ops( template void jit_brgemm_kernel_t::apply_compensation( - int bd_block, int ld_block2, bool is_ld_tail) { + dim_t bd_block, dim_t ld_block2, bool is_ld_tail) { // apply compensation to accumulated values // to avoid the loss of accuracy when converting s32 to f32 auto k_mask = (!is_ld_tail) ? 
ld_full_mask : ld_tail_mask; @@ -1461,9 +1615,9 @@ void jit_brgemm_kernel_t::apply_compensation( mov(reg_aux_zp_comp_a, ptr[rsp + reg_aux_zp_comp_a_offs_]); const auto vmm_zp_comp_a = vmm_tmp(0); - for (int ld = 0; ld < ld_block2; ld++) { + for (dim_t ld = 0; ld < ld_block2; ld++) { const bool is_tail = is_ld_tail && ld + 1 == ld_block2; - for (int bd = 0; bd < bd_block; bd++) { + for (dim_t bd = 0; bd < bd_block; bd++) { if (IMPLICATION(!brg.req_comp_pads_with_bcast, bd == 0)) { const auto zp_comp_a_addr = ptr[reg_aux_zp_comp_a + bd_zp_comp_a_offset(ld, bd)]; @@ -1488,9 +1642,9 @@ void jit_brgemm_kernel_t::apply_compensation( if (brg.zp_type_b != brgemm_broadcast_t::none) { mov(reg_aux_zp_comp_b, ptr[rsp + reg_aux_zp_comp_b_offs_]); - for (int bd = 0; bd < bd_block; bd++) { - int zp_comp_b_off = zp_comp_b_offset(bd); - for (int ld = 0; ld < ld_block2; ld++) { + for (dim_t bd = 0; bd < bd_block; bd++) { + dim_t zp_comp_b_off = zp_comp_b_offset(bd); + for (dim_t ld = 0; ld < ld_block2; ld++) { auto vmm = accm(ld_block2, bd, ld); if (is_superset(brg.isa_impl, avx512_core)) { const auto zp_comp_b_addr = EVEX_compress_addr( @@ -1509,9 +1663,9 @@ void jit_brgemm_kernel_t::apply_compensation( if (!brg.req_cal_comp_pads && brg.req_s8s8_compensation) { mov(reg_aux_compensation, ptr[rsp + reg_aux_comp_offs_]); auto vmm_comp = vmm_tmp(0); - for (int ld = 0; ld < ld_block2; ld++) { + for (dim_t ld = 0; ld < ld_block2; ld++) { const bool is_tail = is_ld_tail && ld + 1 == ld_block2; - for (int bd = 0; bd < bd_block; bd++) { + for (dim_t bd = 0; bd < bd_block; bd++) { if (IMPLICATION(!brg.req_comp_pads_with_bcast, bd == 0)) { const auto comp_addr = ptr[reg_aux_compensation + bd_compensation_offset(ld, bd)]; @@ -1532,7 +1686,7 @@ void jit_brgemm_kernel_t::apply_compensation( template void jit_brgemm_kernel_t::store_accumulators_without_post_ops( - int bd_block, int ld_block2, bool is_ld_tail) { + dim_t bd_block, dim_t ld_block2, bool is_ld_tail) { // if (brg.is_int8 && alpha_or_beta_applicable && !beta_uses_vadd) -> // accumulated values are converted to ps in apply_alpha_beta() @@ -1543,14 +1697,12 @@ void jit_brgemm_kernel_t::store_accumulators_without_post_ops( && !IMPLICATION(alpha_or_beta_applicable, beta_uses_vadd); if (dt_requires_saturation) { - auto vmm_ubound = vmm_tmp(0); - auto vmm_lbound = vmm_tmp(1); - init_saturate_f32( - vmm_lbound, vmm_ubound, reg_tmp_gpr, data_type::f32, brg.dt_d); - for (int bd = 0; bd < bd_block; bd++) { - for (int ld = 0; ld < ld_block2; ld++) { + init_saturate_f32(vmm_lbound(), vmm_ubound(), reg_tmp_gpr, + data_type::f32, brg.dt_d); + for (dim_t bd = 0; bd < bd_block; bd++) { + for (dim_t ld = 0; ld < ld_block2; ld++) { auto vmm = accm(ld_block2, bd, ld); - saturate_cvt_f32(vmm, vmm_lbound, vmm_ubound, brg.dt_d); + saturate_cvt_f32(vmm, vmm_lbound(), vmm_ubound(), brg.dt_d); } } // below call is not required as s32 doesn't use vmm_lbound @@ -1560,8 +1712,8 @@ void jit_brgemm_kernel_t::store_accumulators_without_post_ops( if (brg.is_runtime_ldc && bd_block > 1) mov(ptr[rsp + reg_aux_C_backup_offs_], reg_aux_C); - for_(int bd = 0; bd < bd_block; bd++) - for (int ld = 0; ld < ld_block2; ld++) { + for_(dim_t bd = 0; bd < bd_block; bd++) + for (dim_t ld = 0; ld < ld_block2; ld++) { auto vmm = accm(ld_block2, bd, ld); const auto addr_c = ptr[reg_aux_C + C_offset(bd, ld)]; const bool is_tail = is_ld_tail && ld + 1 == ld_block2; @@ -1581,15 +1733,12 @@ void jit_brgemm_kernel_t::store_accumulators_without_post_ops( } template -void jit_brgemm_kernel_t::store_accumulators(int 
bd_block2, - bool is_bdb_tail, int ld_block2, bool is_ld_tail, +void jit_brgemm_kernel_t::store_accumulators(dim_t bd_block2, + bool is_bdb_tail, dim_t ld_block2, bool is_ld_tail, bool skip_accumulation) { const bool has_zero_points = !everyone_is(brgemm_broadcast_t::none, brg.zp_type_a, brg.zp_type_b, brg.zp_type_c); - const bool are_post_ops_applicable = one_of(true, brg.with_eltwise, - brg.with_binary, brg.with_scales, brg.with_bias, brg.with_sum, - brg.dt_d != brg.dt_c, brg.req_s8s8_compensation, has_zero_points, - brg.with_dst_scales); + const bool are_post_ops_applicable = brg.are_post_ops_applicable(); const bool need_to_apply_alpha_beta = brg.beta != 0.f || brg.alpha != 1.f; const bool need_generate_zp_a_compensation = brg.is_int8 && (brg.req_s8s8_compensation || has_zero_points); @@ -1627,16 +1776,16 @@ void jit_brgemm_kernel_t::store_accumulators(int bd_block2, } mov(reg_buf, ptr[rsp + reg_buf_offs_]); - for (int bdb = 0; bdb < bd_block2; bdb++) { - int adj_bd_block = (brg.is_M_tail && is_bdb_tail) + for (dim_t bdb = 0; bdb < bd_block2; bdb++) { + dim_t adj_bd_block = (brg.is_M_tail && is_bdb_tail) ? brg.bdb_tail : brg.bd_block; - for (int ldb = 0; ldb < ld_block2; ldb++) { - int idx = (is_ld_tail) ? brg.ld_block2 : ldb; + for (dim_t ldb = 0; ldb < ld_block2; ldb++) { + dim_t idx = (is_ld_tail) ? brg.ld_block2 : ldb; if (need_to_apply_alpha_beta || are_post_ops_applicable || apply_zp_a_compensation) { if (skip_accumulation) { - for (int bd = 0; bd < adj_bd_block; bd++) { + for (dim_t bd = 0; bd < adj_bd_block; bd++) { auto vreg_acc = accm(1, bd, 0); uni_vpxor(vreg_acc, vreg_acc, vreg_acc); } @@ -1644,7 +1793,7 @@ void jit_brgemm_kernel_t::store_accumulators(int bd_block2, tilestored(ptr[reg_buf + reg_stride_ld_block], Tmm(brg.get_C_tensor(bdb, idx, is_bdb_tail, is_ld_tail))); - for (int bd = 0; bd < adj_bd_block; bd++) { + for (dim_t bd = 0; bd < adj_bd_block; bd++) { size_t buf_offset = (bd * brg.ld_block) * brg.typesize_C; auto vreg_acc = is_ld_tail @@ -1765,7 +1914,7 @@ void jit_brgemm_kernel_t::store_accumulators(int bd_block2, store_accumulators_amx(false); L_aligned(label_done); } else { - int bd_block = (is_bdb_tail) ? brg.bdb_tail : brg.bd_block; + dim_t bd_block = (is_bdb_tail) ? brg.bdb_tail : brg.bd_block; if (need_generate_zp_a_compensation) { Label label_store_without_comp; @@ -1870,11 +2019,10 @@ void jit_brgemm_kernel_t::set_A_B_matrices() { template void jit_brgemm_kernel_t::maybe_pre_process_data(matrix_kind_t matrix_kind, - const Tmm &t1, reg64_t reg_base, size_t offset, reg64_t reg_stride, - int num_rows, int num_col_bytes, bool is_rd_tail) { - constexpr int tile_size = 1024; + const Tmm &t1, reg64_t reg_base, dim_t offset, reg64_t reg_stride, + dim_t num_rows, dim_t num_col_bytes, bool is_rd_tail) { const auto transform_offset = brg.brgattr.use_interleave_stores - ? brg.get_num_C_tiles() * tile_size + ? brg.get_num_C_tiles() * brgemm_desc_t::tilesize : 0; add(reg_buf_aux, transform_offset); @@ -1897,12 +2045,12 @@ void jit_brgemm_kernel_t::maybe_pre_process_data(matrix_kind_t matrix_kind, template void jit_brgemm_kernel_t::maybe_tileloadd_nt(matrix_kind_t matrix_kind, - int idx, int offset, bool is_rd_tail, bool is_tail) { + dim_t idx, dim_t offset, bool is_rd_tail, bool is_tail, bool last_bdb) { const bool is_A = matrix_kind == matrix_kind_t::matrix_A; - const int tmm_idx = is_A ? brg.get_A_tensor(idx, is_tail) - : brg.get_B_tensor(idx, is_tail); + const dim_t tmm_idx = is_A ? 
brg.get_A_tensor(idx, is_tail)
+            : brg.get_B_tensor(idx, is_tail);
     auto t1 = Tmm(tmm_idx);
 
     auto reg_base = is_A ? reg_aux_A : reg_aux_B;
@@ -1912,24 +2060,25 @@ void jit_brgemm_kernel_t<Wmm>::maybe_tileloadd_nt(matrix_kind_t matrix_kind,
             == (is_A ? brgemm_bd_loop_innermost : brgemm_ld_loop_innermost);
 
     if (brg.is_fp8_via_convert()) {
-        const int typesize_A
+        const dim_t typesize_A
                 = brg.is_input_convert() ? sizeof(int16_t) : brg.typesize_A;
-        const int typesize_B
+        const dim_t typesize_B
                 = brg.is_input_convert() ? sizeof(int16_t) : brg.typesize_B;
-        int rd_step = 4 / typesize_A;
-        int rd_block = (!brg.rdb && brg.rdb_tail) ? brg.rdb_tail : brg.rd_block;
+        dim_t rd_step = 4 / typesize_A;
+        dim_t rd_block
+                = (!brg.rdb && brg.rdb_tail) ? brg.rdb_tail : brg.rd_block;
         if (brg.is_input_convert()) {
             const int vnni_granularity
                     = data_type_vnni_granularity(data_type::f16);
             rd_block = utils::rnd_up(rd_block, vnni_granularity);
         }
-        int A_col = typesize_A * rd_block;
-        int A_row = is_tail ? brg.bdb_tail : brg.bd_block;
+        dim_t A_col = typesize_A * rd_block;
+        dim_t A_row = is_tail ? brg.bdb_tail : brg.bd_block;
 
-        int B_col = (is_tail ? brg.ldb_tail : brg.ld_block) * typesize_B
+        dim_t B_col = (is_tail ? brg.ldb_tail : brg.ld_block) * typesize_B
                 * rd_step;
-        int B_row = brg.typesize_C != 0 ? A_col / brg.typesize_C : 0;
+        dim_t B_row = brg.typesize_C != 0 ? A_col / brg.typesize_C : 0;
 
         mov(ptr[rsp + reg_val_tmp_1_], reg64_fp8_aux);
         mov(ptr[rsp + reg_val_tmp_2_], reg_buf_aux);
@@ -1940,6 +2089,10 @@ void jit_brgemm_kernel_t<Wmm>::maybe_tileloadd_nt(matrix_kind_t matrix_kind,
         mov(reg64_fp8_aux, ptr[rsp + reg_val_tmp_1_]);
         mov(reg_buf_aux, ptr[rsp + reg_val_tmp_2_]);
     } else {
+        if (maybe_pre_process_k_tail(last_bdb || is_tail, is_rd_tail, t1,
+                    reg_base, offset, reg_stride, matrix_kind))
+            return;
+
         const size_t cache_footprint = static_cast<size_t>(brg.typesize_A)
                         * brg.brgattr.hint_expected_A_size
                 + static_cast<size_t>(brg.typesize_B)
@@ -1955,8 +2108,72 @@
 }
 
 template <typename Wmm>
-void jit_brgemm_kernel_t<Wmm>::gemm_microkernel_amx(int bd_block2,
-        bool is_bdb_tail, int ld_block2, bool is_rd_tail, bool is_ld_tail) {
+bool jit_brgemm_kernel_t<Wmm>::maybe_pre_process_k_tail(bool last_bdb,
+        bool is_rd_tail, const Tmm &t1, reg64_t reg_base, dim_t offset,
+        reg64_t reg_stride, matrix_kind_t mk) {
+
+    // TODO: check whether this is the last bs when computing
+    // need_k_tail_processing
+    const auto need_k_tail_processing = mk == matrix_A && brg.amx_wary_k_tail()
+            && brg.rdb_tail != 0 && last_bdb && is_rd_tail;
+    if (!need_k_tail_processing) return false;
+
+    const auto zmm_width_in_bytes = cpu_isa_traits_t<avx512_core>::vlen;
+
+    auto transform_offset = brg.get_num_C_tiles() * brgemm_desc_t::tilesize
+            + brg.get_convert_wsp_buffer_size();
+
+    // TODO: reuse transformed data from matrix A for ldi > 0
+    const dim_t num_rows = palette_.rows[t1.getIdx()];
+    const dim_t num_col_bytes = palette_.cols[t1.getIdx()];
+
+    const auto max_num_cols
+            = nstl::min(num_col_bytes / brg.typesize_A, brg.rdb_tail);
+    const size_t col_tail
+            = max_num_cols % (zmm_width_in_bytes / brg.typesize_A);
+    if (col_tail) {
+        const auto tail_mask = (static_cast<size_t>(1) << col_tail) - 1;
+        mov(reg_tmp_gpr, tail_mask);
+        kmovq(rd_tail_mask, reg_tmp_gpr);
+    }
+    auto zmm_1 = zmm_tmp_1();
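+    // When col_tail is non-zero, the masked loads below keep only the
+    // valid lanes of each row and zero the rest (T_z), so the rows staged
+    // into the buffer are zero-padded before tileloadd consumes them.
+    auto zmm_1_masked = col_tail ?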
zmm_1 | rd_tail_mask | T_z : zmm_1; + + assert(max_num_cols > 0); + + mov(ptr[rsp + reg_val_tmp_2_], reg_buf_aux); + + mov(reg_buf_aux, ptr[rsp + reg_buf_offs_]); + if (transform_offset) add(reg_buf_aux, transform_offset); + + for (dim_t r = 0; r < num_rows; ++r) { + const auto row_offset = offset + r * brg.typesize_A * brg.LDA; + switch (brg.dt_a) { + case data_type::bf16: + case data_type::f16: + vmovdqu16(zmm_1_masked, ptr[reg_base + row_offset]); + break; + case data_type::f8_e5m2: + case data_type::f8_e4m3: + case data_type::s8: + case data_type::u8: + vmovdqu8(zmm_1_masked, ptr[reg_base + row_offset]); + break; + default: assert(!"unsupported data type"); + } + vmovups(ptr[reg_buf_aux + r * zmm_width_in_bytes], zmm_1); + } + // load into tmm from the transformed data. + mov(reg_converted_stride, zmm_width_in_bytes); + tileloadd(t1, ptr[reg_buf_aux + reg_converted_stride]); + mov(reg_buf_aux, ptr[rsp + reg_val_tmp_2_]); + + return true; +} + +template +void jit_brgemm_kernel_t::gemm_microkernel_amx(dim_t bd_block2, + bool is_bdb_tail, dim_t ld_block2, bool is_rd_tail, bool is_ld_tail, + bool last_bdb) { auto tdpbxxd = [this](const Tmm &x1, const Tmm &x2, const Tmm &x3) { if (brg.is_fp8) { if (brg.is_fp8_via_convert()) @@ -1979,20 +2196,20 @@ void jit_brgemm_kernel_t::gemm_microkernel_amx(int bd_block2, assert(!"unsupported combination"); } }; - int rbd_block = (is_rd_tail) ? 1 : brg.rdb; - for (int rdb = 0; rdb < rbd_block; rdb++) { - for (int bdb = 0; bdb < bd_block2; bdb++) { + dim_t rbd_block = (is_rd_tail) ? 1 : brg.rdb; + for (dim_t rdb = 0; rdb < rbd_block; rdb++) { + for (dim_t bdb = 0; bdb < bd_block2; bdb++) { maybe_tileloadd_nt(matrix_kind_t::matrix_A, bdb, rdb * rdb_A_offset() + A_offset(bdb, 0, true), is_rd_tail, - is_bdb_tail); + is_bdb_tail, last_bdb && bdb == bd_block2 - 1); } - for (int ldb = 0; ldb < ld_block2; ldb++) { + for (dim_t ldb = 0; ldb < ld_block2; ldb++) { - const int idx = (is_ld_tail) ? brg.ld_block2 : ldb; + const dim_t idx = (is_ld_tail) ? brg.ld_block2 : ldb; maybe_tileloadd_nt(matrix_kind_t::matrix_B, idx, rdb * rdb_B_offset() + B_offset(ldb, 0, true), is_rd_tail, - is_ld_tail); - for (int bdb = 0; bdb < bd_block2; bdb++) { + is_ld_tail, false); + for (dim_t bdb = 0; bdb < bd_block2; bdb++) { tdpbxxd(Tmm(brg.get_C_tensor( bdb, idx, is_bdb_tail, is_ld_tail)), Tmm(brg.get_A_tensor(bdb, is_bdb_tail)), @@ -2017,9 +2234,7 @@ void jit_brgemm_kernel_t::dot_product(Vmm v1, Vmm v2, Vmm v3) { if (brg.dt_a == data_type::s8 && isa_has_s8s8(brg.isa_impl)) vpdpbssd(v1, v3, v2); else if (brg.has_int8_vnni) - vpdpbusd(v1, v3, v2, - is_superset(brg.isa_impl, avx512_core) ? EvexEncoding - : VexEncoding); + vpdpbusd(v1, v3, v2, get_encoding()); else { vpmaddubsw(int8_dot_product_temp(), v3, v2); vpmaddwd(int8_dot_product_temp(), int8_dot_product_temp(), @@ -2030,12 +2245,13 @@ void jit_brgemm_kernel_t::dot_product(Vmm v1, Vmm v2, Vmm v3) { } template -void jit_brgemm_kernel_t::compute_int8_compensation(int rd_loop, int bd_b, - int bd_e, int bd_block, int ld_block2, bool is_ld_tail, int vpad) { +void jit_brgemm_kernel_t::compute_int8_compensation(dim_t rd_loop, + dim_t bd_b, dim_t bd_e, dim_t bd_block, dim_t ld_block2, + bool is_ld_tail, dim_t vpad) { assert(brg.is_int8); auto compensation_padding = [this, ld_block2](Vmm vmm_load, Vmm vmm_tmp, - int ld, int bd_b, int bd_e) { + dim_t ld, dim_t bd_b, dim_t bd_e) { // req_cal_comp_pads -> only calculate compensation along with // computation and do not use pre-calculated compensation. 
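         // (i.e. the padding term is computed on the fly from the current B
         // block: for s8s8 it is a dot product of B with the broadcast 128
         // input shift, and for a-side zero points a ones-vector dot product
         // scaled by zp_a, instead of being read from a precomputed buffer)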
// Calculate comp padding as: @@ -2046,7 +2262,7 @@ void jit_brgemm_kernel_t::compute_int8_compensation(int rd_loop, int bd_b, dot_product(vmm_tmp, vmm_load, vmm_inp_shift()); } - for (int bd = bd_b; bd < bd_e; bd++) { + for (dim_t bd = bd_b; bd < bd_e; bd++) { auto vmm = accm(ld_block2, bd, ld); if (brg.req_cal_comp_pads) { uni_vpsubd(vmm, vmm, vmm_tmp); @@ -2061,7 +2277,7 @@ void jit_brgemm_kernel_t::compute_int8_compensation(int rd_loop, int bd_b, dot_product(vmm_tmp, vmm_load, vmm_one_bytes()); uni_vpmulld(vmm_tmp, vmm_tmp, vmm_zp_a_shift()); - for (int bd = bd_b; bd < bd_e; bd++) { + for (dim_t bd = bd_b; bd < bd_e; bd++) { auto vmm = accm(ld_block2, bd, ld); if (brg.req_cal_comp_pads) { uni_vpsubd(vmm, vmm, vmm_tmp); @@ -2072,7 +2288,7 @@ void jit_brgemm_kernel_t::compute_int8_compensation(int rd_loop, int bd_b, } }; - if (n_bcast_1_load && brg.zp_type_a != brgemm_broadcast_t::none) { + if (need_comp_pads && brg.zp_type_a != brgemm_broadcast_t::none) { mov(ptr[rsp + reg_bdb_loop_offs_], reg_bdb_loop); const auto reg32_scratch = reg_zp_a_input_shift.cvt32(); mov(reg32_scratch, 0x1010101); @@ -2082,16 +2298,15 @@ void jit_brgemm_kernel_t::compute_int8_compensation(int rd_loop, int bd_b, mov(reg_bdb_loop, ptr[rsp + reg_bdb_loop_offs_]); } - for_(int rd = 0; rd < rd_loop; rd += brg.rd_step) - for (int ld = 0; ld < ld_block2; ++ld) { + for_(dim_t rd = 0; rd < rd_loop; rd += brg.rd_step) + for (dim_t ld = 0; ld < ld_block2; ++ld) { const auto addr = ptr[reg_aux_B + B_offset(ld, rd)]; const bool is_tail = is_ld_tail && ld + 1 == ld_block2; if (IMPLICATION(is_tail, is_superset(brg.isa_impl, avx512_core))) { auto vmm_store = vmm_mask(load(), is_tail, false, ld_tail_mask); uni_vmovups(vmm_store, addr); } else { - load_bytes( - load(), addr, brg.typesize_B * brg.ldb_tail * brg.ld_step); + load_bytes(load(), addr, ldb_B_offset(0, true)); } if (brg.req_cal_comp_pads) { @@ -2105,14 +2320,11 @@ void jit_brgemm_kernel_t::compute_int8_compensation(int rd_loop, int bd_b, } template -void jit_brgemm_kernel_t::gemm_microkernel(int bd_block2, bool is_bdb_tail, - int ld_block2, bool is_rd_tail, bool is_ld_tail, int vpad, - int rows_for_rd_tail) { - assert(!brg.is_fp8_via_convert() && "No non-AMX path for fp8"); - - MAYBE_UNUSED(bd_block2); - int bd_block = (is_bdb_tail) ? brg.bdb_tail : brg.bd_block; - const auto bd_b = nstl::max(0, vpad); +void jit_brgemm_kernel_t::gemm_microkernel_dyn_quant(dim_t bd_block2, + bool is_bdb_tail, dim_t ld_block2, bool is_rd_tail, bool is_ld_tail, + dim_t vpad, dim_t rows_for_rd_tail) { + dim_t bd_block = (is_bdb_tail) ? brg.bdb_tail : brg.bd_block; + const auto bd_b = nstl::max((dim_t)0, vpad); const auto bd_e = nstl::min(bd_block, bd_block + vpad); const auto is_valid_bd = need_comp_pads && vpad != 0 ? 
bd_b <= bd_e : bd_b < bd_e; @@ -2132,9 +2344,13 @@ void jit_brgemm_kernel_t::gemm_microkernel(int bd_block2, bool is_bdb_tail, } else rd_loop = brg.rd_block; + bool maybe_load_bytes = (rows_for_rd_tail > 0 || brg.brgattr.wary_A_k_tail_read) + && is_rd_tail && rd_tail_size != 0 && (brg.is_bf16 || brg.is_int8); + auto broadcast = [this, rd_tail_size](Vmm v1, size_t offset, bool is_tail, data_type_t dt) { if (is_tail) { + uni_vpxor(v1, v1, v1); Xmm xmm_tmp = Xmm(v1.getIdx()); load_bytes( xmm_tmp, reg_aux_A, offset, rd_tail_size * brg.typesize_A); @@ -2160,6 +2376,285 @@ void jit_brgemm_kernel_t::gemm_microkernel(int bd_block2, bool is_bdb_tail, if (brg.req_s8s8_compensation) uni_vpaddb(v1, v1, vmm_inp_shift()); }; + static const int8_t mask_low_half[64] = { + 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, + 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, + 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, + 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F + }; + + mov(ptr[rsp + reg_bdb_loop_offs_], reg_bdb_loop); + mov(ptr[rsp + reg_ldb_loop_offs_], reg_ldb_loop); + + auto reg_ptr = reg_bdb_loop; + auto vmm_mask_low_half = Vmm(isa_num_vregs(brg.isa_impl) - 1); + mov(reg_ptr, (size_t)mask_low_half); + uni_vmovups(vmm_mask_low_half, ptr[reg_ptr]); + + const int vec_size = vreg_traits_t::vlen; + auto accums_stack_space = bd_e * ld_block2 * vec_size; + sub(rsp, accums_stack_space); + for (int bd = bd_b; bd < bd_e; bd++) { + for (int ld = 0; ld < ld_block2; ld++) { + auto vmm_accm = accm(ld_block2, bd, ld); + vmovups(ptr[rsp + (bd * ld_block2 + ld) * vec_size], vmm_accm); + + uni_vxorps(vmm_accm, vmm_accm, vmm_accm); + } + } + + for (int rd = 0; rd < rd_loop; rd += brg.rd_step) { + int prefetch_count_B = 0; + for (int ld = 0; ld < ld_block2; ld++) { + const auto addr = ptr[reg_aux_B + B_offset(ld, rd)]; + const Vmm vmm_load = vmm_mask(load(ld), is_ld_tail, false, ld_tail_mask); + if (brg.dt_b == data_type::u8) { + uni_vmovups(vmm_load, addr); + } else if (brg.dt_b == data_type::u4) { + uni_vmovups(vmm_load, addr); + if (rd % 8 == 0) + uni_vpsrld(vmm_load, vmm_load, 4); + uni_vandps(vmm_load, vmm_load, vmm_mask_low_half); + } else { + assert(!"unsupported combination"); + } + } + + bool have_to_load_bytes + = maybe_load_bytes && (rd == rd_loop - brg.rd_step); + + auto rows_by_load_bytes = have_to_load_bytes ? rows_for_rd_tail : 0; + for (int bd = bd_b; bd < bd_e; bd++) { + if (!is_emdbd) { + const auto bd_by_load_bytes + = (bd >= bd_e - rows_by_load_bytes + || brg.brgattr.wary_A_k_tail_read); + broadcast(bcst(), A_offset(bd, rd), + have_to_load_bytes && bd_by_load_bytes, brg.dt_a); + } + if (prefetch_count_B < ld_block2) { + int typesize_scale = brg.dt_b == data_type::u4 ? 2 : 1; + prefetcht0(ptr[reg_aux_B + B_offset(prefetch_count_B++, rd) + + brg.LDB * brg.rd_block * brg.typesize_B / typesize_scale]); + } + for (int ld = 0; ld < ld_block2; ld++) { + auto vmm = accm(ld_block2, bd, ld); + vpdpbusd(vmm, load(ld), bcst(), is_superset(brg.isa_impl, avx512_core) ? 
EvexEncoding : VexEncoding); + } + } + } + + auto vmm_zero_point = [&](int ld) { + return load(ld); + }; + + auto reg_local_wei_zp = reg_ldb_loop; + auto reg_local_src_grouped_sum = reg_bdb_loop; + auto vmm_tmp = Vmm(isa_num_vregs(brg.isa_impl) - 1); + auto vmm_src_grouped_sum = bcst(); + + if (brg.with_wei_decomp_zero_points) { + mov(reg_local_wei_zp, ptr[rsp + reg_aux2_wei_zero_points_offs_ + accums_stack_space]); + if (brg.wei_decomp_zero_points_stride == 0) { + Vmm vmm_zp = vmm_zero_point(0); + auto reg_ptr_32 = Reg32(reg_ptr.getIdx()); + movzx(reg_ptr_32, ptr[reg_local_wei_zp]); + uni_vmovq(Xmm(vmm_zp.getIdx()), reg_ptr); + uni_vbroadcastss(vmm_zp, Xmm(vmm_zp.getIdx())); + } + + mov(reg_local_src_grouped_sum, ptr[rsp + reg_aux2_src_grouped_sum_offs_ + accums_stack_space]); + for (int bd = bd_b; bd < bd_e; bd++) { + uni_vbroadcastss(vmm_src_grouped_sum, ptr[reg_local_src_grouped_sum + bd * brg.src_grouped_sum_stride * sizeof(int32_t)]); + for (int ld = 0; ld < ld_block2; ld++) { + Vmm vmm_zp = brg.wei_decomp_zero_points_stride == 0 ? vmm_zero_point(0) : vmm_zero_point(ld); + if (bd == bd_b && brg.wei_decomp_zero_points_stride != 0) { + uni_vpmovzxbd(vmm_zp, ptr[reg_local_wei_zp + ld * brg.ld_block * types::data_type_size(brg.wei_decomp_zero_points_dt)]); + } + + auto vmm_accm = accm(ld_block2, bd, ld); + uni_vpmulld(vmm_tmp, vmm_src_grouped_sum, vmm_zp); + uni_vpsubd(vmm_accm, vmm_accm, vmm_tmp); + } + } + } + + auto wei_scale = [&](int ld) { + return load(ld); + }; + + auto reg_local_src_scales = reg_ldb_loop; + auto reg_local_wei_scales = reg_bdb_loop; + auto vmm_src_scales = bcst(); + + mov(reg_local_wei_scales, ptr[rsp + reg_aux2_wei_scales_offs_ + accums_stack_space]); + mov(reg_local_src_scales, ptr[rsp + reg_aux2_src_scales_offs_ + accums_stack_space]); + if (brg.wei_decomp_scales_stride == 0) { + uni_vbroadcastss(wei_scale(0), ptr[reg_local_wei_scales]); + } + + for (int bd = bd_b; bd < bd_e; bd++) { + uni_vbroadcastss(vmm_src_scales, ptr[reg_local_src_scales + bd * brg.src_scales_stride * sizeof(float)]); + for (int ld = 0; ld < ld_block2; ld++) { + auto vmm_wei_scale = brg.wei_decomp_scales_stride == 0 ? wei_scale(0) : wei_scale(ld); + if (bd == bd_b && brg.wei_decomp_scales_stride != 0) { + uni_vmovups(vmm_wei_scale, ptr[reg_local_wei_scales + ld * brg.ld_block * sizeof(float)]); + } + + auto vmm_accm = accm(ld_block2, bd, ld); + uni_vcvtdq2ps(vmm_accm, vmm_accm); + uni_vmulps(vmm_tmp, vmm_accm, vmm_src_scales); + uni_vmovups(vmm_accm, ptr[rsp + (bd * ld_block2 + ld) * vec_size]); + uni_vfmadd231ps(vmm_accm, vmm_tmp, vmm_wei_scale); + } + } + + add(rsp, accums_stack_space); + mov(reg_ldb_loop, ptr[rsp + reg_ldb_loop_offs_]); + mov(reg_bdb_loop, ptr[rsp + reg_bdb_loop_offs_]); + + return; +} + +template +void jit_brgemm_kernel_t::gemm_microkernel(dim_t bd_block2, + bool is_bdb_tail, dim_t ld_block2, bool is_rd_tail, bool is_ld_tail, + dim_t vpad, dim_t rows_for_rd_tail) { + assert(!brg.is_fp8_via_convert() && "No non-AMX path for fp8"); + MAYBE_UNUSED(bd_block2); + + if (brg.with_src_dyn_quant) { + gemm_microkernel_dyn_quant(bd_block2, is_bdb_tail, ld_block2, is_rd_tail, is_ld_tail, vpad, rows_for_rd_tail); + return; + } + + dim_t bd_block = (is_bdb_tail) ? brg.bdb_tail : brg.bd_block; + const auto bd_b = nstl::max(dim_t(0), vpad); + const auto bd_e = nstl::min(bd_block, bd_block + vpad); + const auto is_valid_bd + = need_comp_pads && vpad != 0 ? 
bd_b <= bd_e : bd_b < bd_e; + if (!is_valid_bd) return; + + bool is_emdbd = brg.embd_bcst; + + dim_t rd_loop = 0, rd_tail_size = 0; + if (is_rd_tail) { + if (brg.is_bf16 || brg.is_int8) { + rd_tail_size = brg.rdb_tail % brg.rd_step; + rd_loop = (rd_tail_size != 0) + ? ((brg.rdb_tail / brg.rd_step) + 1) * brg.rd_step + : brg.rdb_tail; + } else + rd_loop = brg.rdb_tail; + } else + rd_loop = brg.rd_block; + + if (brg.req_s8s8_compensation) { + mov(ptr[rsp + reg_bdb_loop_offs_], reg_bdb_loop); + mov(reg_s8_input_shift, 128); + uni_vpbroadcastb(vmm_inp_shift(), reg_s8_input_shift.cvt8()); + mov(reg_bdb_loop, ptr[rsp + reg_bdb_loop_offs_]); + } + + auto broadcast_A = [this, rd_tail_size, is_rd_tail, rd_loop, + rows_for_rd_tail, + bd_e](Vmm vmm_bcast, dim_t bd, dim_t rd) { + const auto offset = A_offset(bd, rd); + const auto dt = brg.dt_a; + const bool maybe_load_bytes + = (rows_for_rd_tail > 0 || brg.brgattr.wary_A_k_tail_read) + && is_rd_tail && rd_tail_size != 0 + && (brg.is_bf16 || brg.is_int8); + const bool have_to_load_bytes + = maybe_load_bytes && (rd == rd_loop - brg.rd_step); + const auto rows_by_load_bytes + = have_to_load_bytes ? rows_for_rd_tail : 0; + const auto bd_by_load_bytes = (bd >= bd_e - rows_by_load_bytes + || brg.brgattr.wary_A_k_tail_read); + const auto is_tail = have_to_load_bytes && bd_by_load_bytes; + if (is_tail) { + Xmm xmm_tmp = Xmm(vmm_bcast.getIdx()); + load_bytes( + xmm_tmp, reg_aux_A, offset, rd_tail_size * brg.typesize_A); + uni_vpbroadcastd(vmm_bcast, xmm_tmp); + } else { + if (dt == data_type::f32) { + uni_vbroadcastss(vmm_bcast, ptr[reg_aux_A + offset]); + } else if (dt == data_type::bf16) { + if (brg.isa_impl == avx2_vnni_2) + vbcstnebf162ps(vmm_bcast, ptr[reg_aux_A + offset]); + else + uni_vpbroadcastd(vmm_bcast, ptr[reg_aux_A + offset]); + } else if (one_of(dt, data_type::s8, data_type::u8)) { + uni_vpbroadcastd(vmm_bcast, ptr[reg_aux_A + offset]); + } else if (dt == data_type::f16) { + if (brg.isa_impl == avx2_vnni_2) { + vbcstnesh2ps(vmm_bcast, ptr[reg_aux_A + offset]); + } else if (is_superset(brg.isa_impl, avx512_core_fp16)) { + // Broadcast is not supported for legacy f16-conversions. + vcvtph2psx(vmm_bcast, ptr_b[reg_aux_A + offset]); + } + } + } + + if (brg.req_s8s8_compensation) + uni_vpaddb(vmm_bcast, vmm_bcast, vmm_inp_shift()); + }; + + auto load_B = [this, is_ld_tail](dim_t vmm_load_idx, dim_t rd, dim_t ld) { + const Vmm vmm_load + = vmm_mask(load(vmm_load_idx), is_ld_tail, false, ld_tail_mask); + const auto addr = ptr[reg_aux_B + B_offset(ld, rd)]; + // Note: Assuming the tails are properly padded/blocked for + // avx2_vnni_2 with xf16 data type, as the B matrix is generally + // at least double-blocked. 
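+        // For f16 B without AMX or native VNNI support, the even/odd
+        // halves of the VNNI-packed block are separated with a precomputed
+        // vpermw permutation (f16_perm_even/odd_vreg) and then upconverted
+        // with vcvtph2psx; on avx2_vnni_2 the paired
+        // vcvtneeph2ps/vcvtneoph2ps instructions do both steps at once.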
+ if (brg.dt_b == data_type::f16) { + if (brg.isa_impl == avx2_vnni_2) { + if (rd % 2 == 0) + vcvtneeph2ps(vmm_load, addr); + else + vcvtneoph2ps(vmm_load, addr); + } else if (brg.is_f16_b_non_amx_vnni()) { + const auto actual_B_offset = B_offset(ld, utils::rnd_dn(rd, 2)); + const auto vnni_addr = ptr[reg_aux_B + actual_B_offset]; + vmovups(vmm_load, vnni_addr); + if (rd % 2 == 0) + vpermw(vmm_load, f16_perm_even_vreg(), vmm_load); + else + vpermw(vmm_load, f16_perm_odd_vreg(), vmm_load); + vcvtph2psx(vmm_load, Vmm_lower_t(vmm_load.getIdx())); + } else if (is_ld_tail && !is_superset(brg.isa_impl, avx512_core)) { + load_bytes(vmm_load, addr, ldb_B_offset(0, true)); + vcvtph2ps(vmm_load, Xmm(vmm_load.getIdx())); + } else { + uni_vcvtph2psx(vmm_load, addr); + } + } else if (brg.dt_b == data_type::bf16) { + if (brg.isa_impl == avx2_vnni_2) { + if (rd % 2 == 0) + vcvtneebf162ps(vmm_load, addr); + else + vcvtneobf162ps(vmm_load, addr); + } else if (utils::one_of(brg.isa_impl, avx512_core, avx2) && brg.is_f32) { + // Upconvert: load 16 bits and move them 16 bits left. + uni_vpmovzxwd(vmm_load, addr); + uni_vpslld(vmm_load, vmm_load, 16); + } else if (is_ld_tail && !is_superset(brg.isa_impl, avx512_core)) { + load_bytes(vmm_load, addr, ldb_B_offset(0, true)); + } else { + uni_vmovups(vmm_load, addr); + } + } else if (is_ld_tail) { + if (is_superset(brg.isa_impl, avx512_core)) { + uni_vmovups(vmm_load, addr); + } else { + load_bytes(vmm_load, addr, ldb_B_offset(0, true)); + } + } else { + uni_vmovups(vmm_load, addr); + } + }; + const bool comp_vpad = vpad != 0 && (brg.req_s8s8_compensation || brg.zp_type_a != brgemm_broadcast_t::none); @@ -2167,52 +2662,14 @@ void jit_brgemm_kernel_t::gemm_microkernel(int bd_block2, bool is_bdb_tail, compute_int8_compensation( rd_loop, bd_b, bd_e, bd_block, ld_block2, is_ld_tail, vpad); - bool maybe_load_bytes = (rows_for_rd_tail > 0 || brg.brgattr.wary_tail_read) - && is_rd_tail && rd_tail_size != 0 && (brg.is_bf16 || brg.is_int8); - if (n_bcast_1_load) { - for (int rd = 0; rd < rd_loop; rd += brg.rd_step) { - bool have_to_load_bytes - = maybe_load_bytes && (rd == rd_loop - brg.rd_step); - - auto rows_by_load_bytes = have_to_load_bytes ? rows_for_rd_tail : 0; - for (int bd = bd_b; bd < bd_e && !is_emdbd; bd++) { - const auto bd_by_load_bytes = (bd >= bd_e - rows_by_load_bytes - || brg.brgattr.wary_tail_read); - broadcast(bcst(bd), A_offset(bd, rd), - have_to_load_bytes && bd_by_load_bytes, brg.dt_a); - } - for (int ld = 0; ld < ld_block2; ld++) { - const auto addr = ptr[reg_aux_B + B_offset(ld, rd)]; - const Vmm vmm_load - = vmm_mask(load(), is_ld_tail, false, ld_tail_mask); - // Note: Assuming the tails are properly padded/blocked for - // avx2_vnni_2 with xf16 data type, as the B matrix is generally - // at least double-blocked. 
- if (brg.dt_b == data_type::f16) { - if (brg.isa_impl == avx2_vnni_2) { - if (rd % 2 == 0) - vcvtneeph2ps(vmm_load, addr); - else - vcvtneoph2ps(vmm_load, addr); - } else - vcvtph2psx(vmm_load, addr); - } else if (brg.dt_b == data_type::bf16 - && brg.isa_impl == avx2_vnni_2) { - if (rd % 2 == 0) - vcvtneebf162ps(vmm_load, addr); - else - vcvtneobf162ps(vmm_load, addr); - } else if (is_ld_tail) { - if (is_superset(brg.isa_impl, avx512_core)) { - uni_vmovups(vmm_load, addr); - } else { - load_bytes(vmm_load, addr, - brg.typesize_B * brg.ldb_tail * brg.ld_step); - } - } else { - uni_vmovups(vmm_load, addr); - } - for (int bd = bd_b; bd < bd_e; bd++) { + for (dim_t rd = 0; rd < rd_loop; rd += brg.rd_step) { + if (brg.n_bcast_1_load) { + for (dim_t bd = bd_b; bd < bd_e && !is_emdbd; bd++) + broadcast_A(bcst(bd), bd, rd); + for (dim_t ld = 0; ld < ld_block2; ld++) { + load_B(0, rd, ld); + + for (dim_t bd = bd_b; bd < bd_e; bd++) { auto vmm = accm(ld_block2, bd, ld); if (is_emdbd) uni_vfmadd231ps(vmm, load(), @@ -2221,61 +2678,337 @@ void jit_brgemm_kernel_t::gemm_microkernel(int bd_block2, bool is_bdb_tail, dot_product(vmm, load(), bcst(bd)); } } - } - } else { - for (int rd = 0; rd < rd_loop; rd += brg.rd_step) { - int prefetch_count_B = 0; - for (int ld = 0; ld < ld_block2; ld++) { - const auto addr = ptr[reg_aux_B + B_offset(ld, rd)]; - const Vmm vmm_load - = vmm_mask(load(ld), is_ld_tail, false, ld_tail_mask); - // Note: Assuming the tails are properly padded/blocked for - // avx2_vnni_2, as the B matrix is generally - // at least double-blocked. - if (brg.dt_b == data_type::f16) { - if (brg.isa_impl == avx2_vnni_2) { - if (rd % 2 == 0) - vcvtneeph2ps(vmm_load, addr); + } else { + if (brg.with_wei_decomp) { + auto reg_local_wei_scales = reg_bdb_loop; + auto reg_local_wei_zp = reg_ldb_loop; + auto reg_ptr = reg_local_wei_zp; + + auto accm_tmp = [&](int ld_block, int bd, int ld) { + int idx = max_effective_vregs - 1 - 2 * (brg.ld_block2 * brg.bd_block) - ld; + return Vmm(idx); + }; + + auto load_zero_points = [&](Vmm vmm_zp, Xbyak::Address addr) { + if (brg.wei_decomp_zero_points_stride == 0) { + switch (brg.wei_decomp_zero_points_dt) { + case data_type::f32: { + uni_vbroadcastss(vmm_zp, addr); + break; + } + case data_type::u8: { + auto xmm_zp = Xmm(vmm_zp.getIdx()); + auto reg_ptr_32 = Reg32(reg_ptr.getIdx()); + movzx(reg_ptr_32, addr); + uni_vmovq(xmm_zp, reg_ptr); + uni_vcvtdq2ps(xmm_zp, xmm_zp); + uni_vbroadcastss(vmm_zp, xmm_zp); + break; + } + default: assert(!"unsupported data type"); + } + } else { + switch (brg.wei_decomp_zero_points_dt) { + case data_type::f32: { + uni_vmovups(vmm_zp, addr); + break; + } + case data_type::u8: { + uni_vpmovzxbd(vmm_zp, addr); + uni_vcvtdq2ps(vmm_zp, vmm_zp); + break; + } + default: assert(!"unsupported data type"); + } + } + }; + + auto load_scales = [&](Vmm vmm_scales, Xbyak::Address addr) { + if (brg.wei_decomp_scales_stride == 0) { + switch (brg.wei_decomp_scales_dt) { + case data_type::f32: { + uni_vbroadcastss(vmm_scales, addr); + break; + } + case data_type::e8m0: { + auto xmm_scales = Xmm(vmm_scales.getIdx()); + auto reg_ptr_32 = Reg32(reg_ptr.getIdx()); + movzx(reg_ptr_32, addr); + uni_vmovq(xmm_scales, reg_ptr); + uni_vpslld(xmm_scales, xmm_scales, 23); + uni_vbroadcastss(vmm_scales, xmm_scales); + break; + } + default: assert(!"unsupported data type"); + } + } else { + switch (brg.wei_decomp_scales_dt) { + case data_type::f32: { + uni_vmovups(vmm_scales, addr); + break; + } + case data_type::e8m0: { + uni_vpmovzxbd(vmm_scales, addr); + 
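+                        // e8m0 stores just an 8-bit exponent: shifting it
+                        // into the f32 exponent field (bits 30..23) turns
+                        // each byte e into the scale 2^(e - 127) directly,
+                        // with no multiply required.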
uni_vpslld(vmm_scales, vmm_scales, 23); + break; + } + default: assert(!"unsupported data type"); + } + } + }; + + mov(ptr[rsp + reg_bdb_loop_offs_], reg_bdb_loop); + mov(ptr[rsp + reg_ldb_loop_offs_], reg_ldb_loop); + + auto vmm_zero_points = Vmm(isa_num_vregs(brg.isa_impl) - 1); + auto vmm_mask8 = Vmm(isa_num_vregs(brg.isa_impl) - 1); + auto vmm_mask7 = Vmm(isa_num_vregs(brg.isa_impl) - 2); + auto vmm_lookup = Vmm(isa_num_vregs(brg.isa_impl) - 1); + auto vmm_lookup_low = Vmm(isa_num_vregs(brg.isa_impl) - 3); + auto vmm_lookup_high = Vmm(isa_num_vregs(brg.isa_impl) - 4); + auto vmm_mask_signed_bit = Vmm(isa_num_vregs(brg.isa_impl) - 2); + if (brg.dt_b == data_type::nf4) { + static const float lookup[16] = { + -1.0, + -0.6961928009986877, + -0.5250730514526367, + -0.39491748809814453, + -0.28444138169288635, + -0.18477343022823334, + -0.09105003625154495, + 0.0, + 0.07958029955625534, + 0.16093020141124725, + 0.24611230194568634, + 0.33791524171829224, + 0.44070982933044434, + 0.5626170039176941, + 0.7229568362236023, + 1.0}; + + static const int32_t mask8[16] = { + 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8 + }; + static const int32_t mask7[16] = { + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7 + }; + + if (brg.isa_impl == avx2) { + mov(reg_ptr, (size_t)lookup); + uni_vmovups(vmm_lookup_low, ptr[reg_ptr]); + mov(reg_ptr, (size_t)lookup); + uni_vmovups(vmm_lookup_high, ptr[reg_ptr + 8 * sizeof(float)]); + mov(reg_ptr, (size_t)mask8); + uni_vmovups(vmm_mask8, ptr[reg_ptr]); + mov(reg_ptr, (size_t)mask7); + uni_vmovups(vmm_mask7, ptr[reg_ptr]); + if (brg.wei_decomp_zero_points_stride == 0) + vmm_zero_points = Vmm(isa_num_vregs(brg.isa_impl) - 6); else - vcvtneoph2ps(vmm_load, addr); + vmm_zero_points = Vmm(isa_num_vregs(brg.isa_impl) - 5); } else { - vcvtph2psx(vmm_load, addr); + mov(reg_ptr, (size_t)lookup); + uni_vmovups(vmm_lookup, ptr[reg_ptr]); + vmm_zero_points = Vmm(isa_num_vregs(brg.isa_impl) - 2); } - } else if (brg.dt_b == data_type::bf16 - && brg.isa_impl == avx2_vnni_2) { - if (rd % 2 == 0) - vcvtneebf162ps(vmm_load, addr); - else - vcvtneobf162ps(vmm_load, addr); - } else if (is_ld_tail) { - if (is_superset(brg.isa_impl, avx512_core)) { - uni_vmovups(vmm_load, addr); + } else if (brg.dt_b == data_type::f4_e2m1) { + static const float lookup[16] = { + 0.0f, 0.5f, + 1.0f, 1.5f, + 2.0f, 3.0f, + 4.0f, 6.0f, + -0.0f, -0.5f, + -1.0f, -1.5f, + -2.0f, -3.0f, + -4.0f, -6.0f + }; + + static const uint32_t mask_signed_bit[8] = { + 0x80000000, 0x80000000, 0x80000000, 0x80000000, + 0x80000000, 0x80000000, 0x80000000, 0x80000000, + }; + + if (brg.isa_impl == avx2) { + mov(reg_ptr, (size_t)lookup); + uni_vmovups(vmm_lookup, ptr[reg_ptr]); + mov(reg_ptr, (size_t)mask_signed_bit); + uni_vmovups(vmm_mask_signed_bit, ptr[reg_ptr]); + vmm_zero_points = Vmm(isa_num_vregs(brg.isa_impl) - 3); } else { - load_bytes(vmm_load, addr, - brg.typesize_B * brg.ldb_tail * brg.ld_step); + mov(reg_ptr, (size_t)lookup); + uni_vmovups(vmm_lookup, ptr[reg_ptr]); + vmm_zero_points = Vmm(isa_num_vregs(brg.isa_impl) - 2); } - } else { - uni_vmovups(vmm_load, addr); } - } - bool have_to_load_bytes - = maybe_load_bytes && (rd == rd_loop - brg.rd_step); + mov(reg_local_wei_scales, ptr[rsp + reg_aux2_wei_scales_offs_]); + mov(reg_local_wei_zp, ptr[rsp + reg_aux2_wei_zero_points_offs_]); - auto rows_by_load_bytes = have_to_load_bytes ? 
rows_for_rd_tail : 0; - for (int bd = bd_b; bd < bd_e; bd++) { - if (!is_emdbd) { - const auto bd_by_load_bytes - = (bd >= bd_e - rows_by_load_bytes - || brg.brgattr.wary_tail_read); - broadcast(bcst(), A_offset(bd, rd), - have_to_load_bytes && bd_by_load_bytes, brg.dt_a); + if (brg.with_wei_decomp_zero_points && brg.wei_decomp_zero_points_stride == 0) { + load_zero_points(vmm_zero_points, ptr[reg_local_wei_zp]); + } + + for (int rd = 0; rd < rd_loop; rd += brg.rd_step) { + int prefetch_count_B = 0; + for (int ld = 0; ld < ld_block2; ld++) { + const auto addr = ptr[reg_aux_B + B_offset(ld, rd)]; + const Vmm vmm_load = vmm_mask(load(ld), is_ld_tail, false, ld_tail_mask); + if (brg.dt_b == data_type::u8) { + uni_vpmovzxbd(vmm_load, addr); + uni_vcvtdq2ps(vmm_load, vmm_load); + } else if (brg.dt_b == data_type::s8) { + uni_vpmovsxbd(vmm_load, addr); + uni_vcvtdq2ps(vmm_load, vmm_load); + } else if (brg.dt_b == data_type::u4) { + uni_vpmovzxbd(vmm_load, addr); + if (rd % 2 == 0) { + uni_vpsrld(vmm_load, vmm_load, 4); + } else { + uni_vpslld(vmm_load, vmm_load, 28); + uni_vpsrld(vmm_load, vmm_load, 28); + } + uni_vcvtdq2ps(vmm_load, vmm_load); + } else if (brg.dt_b == data_type::s4) { + if (rd % 2 == 0) { + uni_vpmovsxbd(vmm_load, addr); + vpsrad(vmm_load, vmm_load, 4); + } else { + uni_vpmovsxbd(vmm_load, addr); + uni_vpslld(vmm_load, vmm_load, 28); + vpsrad(vmm_load, vmm_load, 28); + } + uni_vcvtdq2ps(vmm_load, vmm_load); + } else if (brg.dt_b == data_type::nf4) { + uni_vpmovzxbd(vmm_load, addr); + if (rd % 2 == 0) { + uni_vpsrld(vmm_load, vmm_load, 4); + } else { + uni_vpslld(vmm_load, vmm_load, 28); + uni_vpsrld(vmm_load, vmm_load, 28); + } + + if (brg.isa_impl == avx2) { + auto res = bcst(); + auto mask = Vmm(isa_num_vregs(brg.isa_impl) - 5); + vpcmpgtd(mask, vmm_load, vmm_mask7); + vpermd(res, vmm_load, vmm_lookup_low); + vpsubd(vmm_load, vmm_load, vmm_mask8); + vpermd(vmm_load, vmm_load, vmm_lookup_high); + vblendvps(vmm_load, res, vmm_load, mask); + } else { + vpermd(vmm_load, vmm_load, vmm_lookup); + } + } else if (brg.dt_b == data_type::f4_e2m1) { + if (brg.isa_impl == avx2) { + uni_vpmovsxbd(vmm_load, addr); + if (rd % 2 == 0) { + vpsrad(vmm_load, vmm_load, 4); + } else { + uni_vpslld(vmm_load, vmm_load, 28); + vpsrad(vmm_load, vmm_load, 28); + } + auto mask = bcst(); + uni_vpand(mask, vmm_load, vmm_mask_signed_bit); + vpermd(vmm_load, vmm_load, vmm_lookup); + uni_vorps(vmm_load, vmm_load, mask); + } else { + uni_vpmovzxbd(vmm_load, addr); + if (rd % 2 == 0) { + uni_vpsrld(vmm_load, vmm_load, 4); + } else { + uni_vpslld(vmm_load, vmm_load, 28); + uni_vpsrld(vmm_load, vmm_load, 28); + } + vpermd(vmm_load, vmm_load, vmm_lookup); + } + } else { + assert(!"unsupported combination"); + } + + if (brg.with_wei_decomp_zero_points) { + if (brg.wei_decomp_zero_points_stride == 0) { + uni_vsubps(vmm_load, vmm_load, vmm_zero_points); + } else { + load_zero_points(bcst(), ptr[reg_local_wei_zp + ld * brg.ld_block * types::data_type_size(brg.wei_decomp_zero_points_dt)]); + uni_vsubps(vmm_load, vmm_load, bcst()); + } + } + + if (brg.with_wei_decomp_scales && brg.bd_block != 1) { + if (brg.wei_decomp_scales_stride == 0) { + load_scales(bcst(), ptr[reg_local_wei_scales]); + } else { + load_scales(bcst(), ptr[reg_local_wei_scales + ld * brg.ld_block * types::data_type_size(brg.wei_decomp_scales_dt)]); + } + uni_vmulps(vmm_load, vmm_load, bcst()); + } + } + + for (int bd = bd_b; bd < bd_e; bd++) { + if (!is_emdbd) { + if (brg.dt_a == data_type::bf16) { + vpbroadcastw(bcst(), ptr[reg_aux_A + A_offset(bd, 
rd)]); + uni_vpmovzxwd(bcst(), bcst()); + uni_vpslld(bcst(), bcst(), 16); + } else { + broadcast_A(bcst(bd), bd, rd); + } + } + if (prefetch_count_B < ld_block2) { + prefetcht0(ptr[reg_aux_B + B_offset(prefetch_count_B++, rd) + + brg.LDB * brg.rd_block * brg.typesize_B]); + } + for (int ld = 0; ld < ld_block2; ld++) { + auto vmm = brg.bd_block != 1 ? accm(ld_block2, bd, ld) + : accm_tmp(ld_block2, bd, ld); + if (brg.bd_block == 1 && rd == 0) { + if (is_emdbd) + uni_vmulps(vmm, load(ld), ptr_b[reg_aux_A + A_offset(bd, rd)]); + else + uni_vmulps(vmm, load(ld), bcst()); + } else { + if (is_emdbd) + uni_vfmadd231ps(vmm, load(ld), ptr_b[reg_aux_A + A_offset(bd, rd)]); + else + uni_vfmadd231ps(vmm, load(ld), bcst()); + } + } + } + } + + if (brg.with_wei_decomp_scales && brg.bd_block == 1) { + for (int ld = 0; ld < ld_block2; ld++) { + auto vmm_accm_tmp = accm_tmp(ld_block2, 0, ld); + auto vmm_accm = accm(ld_block2, 0, ld); + if (brg.wei_decomp_scales_stride == 0) { + load_scales(bcst(), ptr[reg_local_wei_scales]); + } else { + load_scales(bcst(), ptr[reg_local_wei_scales + ld * brg.ld_block * types::data_type_size(brg.wei_decomp_scales_dt)]); + } + uni_vfmadd231ps(vmm_accm, vmm_accm_tmp, bcst()); + } } + + mov(reg_ldb_loop, ptr[rsp + reg_ldb_loop_offs_]); + mov(reg_bdb_loop, ptr[rsp + reg_bdb_loop_offs_]); + + return; + } + + dim_t prefetch_count_B = 0; + for (dim_t ld = 0; ld < ld_block2; ld++) { + load_B(ld, rd, ld); + } + + for (dim_t bd = bd_b; bd < bd_e; bd++) { + if (!is_emdbd) broadcast_A(bcst(), bd, rd); if (prefetch_count_B < ld_block2) { prefetcht0(ptr[reg_aux_B + B_offset(prefetch_count_B++, rd) + brg.LDB * brg.rd_block * brg.typesize_B]); } - for (int ld = 0; ld < ld_block2; ld++) { + for (dim_t ld = 0; ld < ld_block2; ld++) { auto vmm = accm(ld_block2, bd, ld); if (is_emdbd) uni_vfmadd231ps(vmm, load(ld), @@ -2289,21 +3022,118 @@ void jit_brgemm_kernel_t::gemm_microkernel(int bd_block2, bool is_bdb_tail, } template -void jit_brgemm_kernel_t::ldb_loop(int bd_block2, bool is_bdb_tail, - int ld_block2, int ldb_loop_length, bool is_reg_tail, bool is_ld_tail, - bool check_top_vpad, bool check_bottom_vpad, int rows_for_rd_tail, +void jit_brgemm_kernel_t::ldb_loop(dim_t bd_block2, bool is_bdb_tail, + dim_t ld_block2, dim_t ldb_loop_length, bool is_reg_tail, + bool is_ld_tail, bool first_bdb, bool last_bdb, dim_t rows_for_rd_tail, bool skip_accumulation) { + auto ic_group_shift_generic = [&]() { + if ((brg.with_grouped_wei_decomp && (brg.wei_decomp_scales_stride != 0 || brg.wei_decomp_zero_points_stride != 0)) + || brg.with_src_dyn_quant) { + auto reg_local_ic = reg_aux_D; + auto reg_local_wei_params = reg_bdb_loop; + auto reg_local_ic_group = reg_ldb_loop; + + auto ic_group_shift = [&](int src_offs, int dst_offs, int group_size, int stride) { + mov(reg_local_ic, ptr[rsp + reg_aux_ic_offs_]); + mov(reg_local_ic_group, group_size); + xor_(rdx, rdx); + idiv(reg_local_ic_group); + imul(reg_local_ic, reg_local_ic, stride); + + mov(reg_local_wei_params, ptr[rsp + src_offs]); + add(reg_local_wei_params, reg_local_ic); + mov(ptr[rsp + dst_offs], reg_local_wei_params); + }; + + mov(ptr[rsp + reg_bdb_loop_offs_], reg_bdb_loop); + mov(ptr[rsp + reg_aux2_D_offs_], reg_aux_D); + mov(ptr[rsp + reg_ldb_loop_offs_], reg_ldb_loop); + mov(ptr[rsp + reg_reg_a_offset_offs_], reg_a_offset); // preserve rdx for idiv + + if (brg.with_wei_decomp_scales && brg.wei_decomp_scales_stride != 0) { + ic_group_shift(reg_aux_wei_scales_offs_, reg_aux2_wei_scales_offs_, + brg.wei_decomp_scales_group_size, 
brg.wei_decomp_scales_stride * types::data_type_size(brg.wei_decomp_scales_dt)); + } + + if (brg.with_wei_decomp_zero_points && brg.wei_decomp_zero_points_stride != 0) { + ic_group_shift(reg_aux_wei_zero_points_offs_, reg_aux2_wei_zero_points_offs_, + brg.wei_decomp_zero_points_group_size, brg.wei_decomp_zero_points_stride * types::data_type_size(brg.wei_decomp_zero_points_dt)); + } + + if (brg.with_src_dyn_quant) { + ic_group_shift(reg_aux_src_scales_offs_, reg_aux2_src_scales_offs_, + brg.src_scales_group_size, sizeof(float)); + + if (brg.with_wei_decomp_zero_points) { + ic_group_shift(reg_aux_src_grouped_sum_offs_, reg_aux2_src_grouped_sum_offs_, + brg.src_sum_group_size, sizeof(int32_t)); + } + } + + mov(reg_local_ic, ptr[rsp + reg_aux_ic_offs_]); + add(reg_local_ic, brg.rd_block); + mov(ptr[rsp + reg_aux_ic_offs_], reg_local_ic); + + mov(reg_bdb_loop, ptr[rsp + reg_bdb_loop_offs_]); + mov(reg_aux_D, ptr[rsp + reg_aux2_D_offs_]); + mov(reg_ldb_loop, ptr[rsp + reg_ldb_loop_offs_]); + mov(reg_a_offset, ptr[rsp + reg_reg_a_offset_offs_]); + } + }; + + auto ic_group_shift_opt = [&](int rb) { + if ((brg.with_grouped_wei_decomp && (brg.wei_decomp_scales_stride != 0 || brg.wei_decomp_zero_points_stride != 0)) + || brg.with_src_dyn_quant) { + mov(ptr[rsp + reg_bdb_loop_offs_], reg_rdb_loop); + auto reg_ptr = reg_rdb_loop; + + auto ic_group_shift = [&](int src_offs, int dst_offs, int group_size, int stride) { + if ((rb + 1) * brg.rd_block % group_size == 0) { + mov(reg_ptr, ptr[rsp + src_offs]); + add(reg_ptr, stride); + mov(ptr[rsp + dst_offs], reg_ptr); + } + }; + + if (brg.with_wei_decomp_scales && brg.wei_decomp_scales_stride != 0) { + ic_group_shift(reg_aux2_wei_scales_offs_, reg_aux2_wei_scales_offs_, + brg.wei_decomp_scales_group_size, brg.wei_decomp_scales_stride * types::data_type_size(brg.wei_decomp_scales_dt)); + } + + if (brg.with_wei_decomp_zero_points && brg.wei_decomp_zero_points_stride != 0) { + ic_group_shift(reg_aux2_wei_zero_points_offs_, reg_aux2_wei_zero_points_offs_, + brg.wei_decomp_zero_points_group_size, brg.wei_decomp_zero_points_stride * types::data_type_size(brg.wei_decomp_zero_points_dt)); + } + + if (brg.with_src_dyn_quant) { + ic_group_shift(reg_aux2_src_scales_offs_, reg_aux2_src_scales_offs_, + brg.src_scales_group_size, sizeof(float)); + + if (brg.with_wei_decomp_zero_points) { + ic_group_shift(reg_aux2_src_grouped_sum_offs_, reg_aux2_src_grouped_sum_offs_, + brg.src_sum_group_size, sizeof(int32_t)); + } + } + + mov(reg_rdb_loop, ptr[rsp + reg_bdb_loop_offs_]); + } + }; Label ldb_loop_label; Label BS_loop_label; copy_post_ops_stack_values_to_aux(is_reg_tail); - auto ld_loop_body = [&](int vpad) { + auto ld_loop_body = [&](dim_t vpad, bool last_bdb) { + if (brg.with_grouped_wei_decomp) { + mov(reg_ic, ptr[rsp + reg_ic_offs_]); + mov(ptr[rsp + reg_aux_ic_offs_], reg_ic); + } + set_A_B_matrices(); - int bd_block = (is_bdb_tail) ? brg.bdb_tail : brg.bd_block; - const auto bd_b = nstl::max(0, vpad); + dim_t bd_block = (is_bdb_tail) ? brg.bdb_tail : brg.bd_block; + const auto bd_b = nstl::max(dim_t(0), vpad); const auto bd_e = nstl::min(bd_block, bd_block + vpad); const auto is_valid_bd = need_comp_pads && vpad != 0 ? 
bd_b <= bd_e : bd_b < bd_e; @@ -2311,32 +3141,87 @@ void jit_brgemm_kernel_t::ldb_loop(int bd_block2, bool is_bdb_tail, if (brg.is_tmm) { const bool is_rd_tail = false; - gemm_microkernel_amx( - bd_block2, is_bdb_tail, ld_block2, is_rd_tail, is_ld_tail); + gemm_microkernel_amx(bd_block2, is_bdb_tail, ld_block2, is_rd_tail, + is_ld_tail, last_bdb); } else { - if (brg.rdb > 0) { - Label rdb_loop_label; - mov(reg_rdb_loop, brg.rdb); - L_aligned(rdb_loop_label, 64); - { - const bool is_rd_tail = false; - gemm_microkernel(bd_block2, is_bdb_tail, ld_block2, - is_rd_tail, is_ld_tail, vpad, rows_for_rd_tail); + ic_group_shift_generic(); + + auto rdb_group = brg.rd_block; + auto rd_size = brg.rdb * brg.rd_block + brg.rdb_tail; + if (brg.wei_decomp_scales_group_size < rd_size) + rdb_group = nstl::max(rdb_group, brg.wei_decomp_scales_group_size); + if (brg.wei_decomp_zero_points_group_size < rd_size) + rdb_group = nstl::max(rdb_group, brg.wei_decomp_zero_points_group_size); + if (brg.with_src_dyn_quant) { + rdb_group = nstl::max(rdb_group, brg.src_scales_group_size); + if (brg.with_wei_decomp_zero_points) { + rdb_group = nstl::max(rdb_group, brg.src_sum_group_size); + } + } + rdb_group = rdb_group / brg.rd_block; + auto rbd_blocks = brg.rdb / rdb_group; + auto max_rdb_unroll = 8; + + if (brg.with_wei_decomp && rdb_group <= max_rdb_unroll) { + if (rbd_blocks > 0) { + Label rdb_loop_label; + mov(reg_rdb_loop, rbd_blocks); + L_aligned(rdb_loop_label, 64); + { + for (int rb = 0; rb < rdb_group; rb++) { + gemm_microkernel(bd_block2, is_bdb_tail, ld_block2, false, + is_ld_tail, vpad, rows_for_rd_tail); + + add(reg_aux_A, rdb_A_offset()); + add(reg_aux_B, rdb_B_offset()); + + ic_group_shift_opt(rb); + } + + dec(reg_rdb_loop); + cmp(reg_rdb_loop, 0); + } + jg(rdb_loop_label, T_NEAR); + } + + for (int rb = rbd_blocks * rdb_group; rb < brg.rdb; rb++) { + gemm_microkernel(bd_block2, is_bdb_tail, ld_block2, false, + is_ld_tail, vpad, rows_for_rd_tail); add(reg_aux_A, rdb_A_offset()); add(reg_aux_B, rdb_B_offset()); - dec(reg_rdb_loop); - cmp(reg_rdb_loop, 0); + ic_group_shift_opt(rb); + + mov(reg_rdb_loop, ptr[rsp + reg_bdb_loop_offs_]); + } + } else { + if (brg.rdb > 0) { + Label rdb_loop_label; + mov(reg_rdb_loop, brg.rdb); + L_aligned(rdb_loop_label, 64); + { + const bool is_rd_tail = false; + gemm_microkernel(bd_block2, is_bdb_tail, ld_block2, + is_rd_tail, is_ld_tail, vpad, rows_for_rd_tail); + + add(reg_aux_A, rdb_A_offset()); + add(reg_aux_B, rdb_B_offset()); + + ic_group_shift_generic(); + + dec(reg_rdb_loop); + cmp(reg_rdb_loop, 0); + } + jg(rdb_loop_label, T_NEAR); } - jg(rdb_loop_label, T_NEAR); } } if (brg.rdb_tail != 0) { const bool is_rd_tail = true; if (brg.is_tmm) { gemm_microkernel_amx(bd_block2, is_bdb_tail, ld_block2, - is_rd_tail, is_ld_tail); + is_rd_tail, is_ld_tail, last_bdb); } else { gemm_microkernel(bd_block2, is_bdb_tail, ld_block2, is_rd_tail, is_ld_tail, vpad, rows_for_rd_tail); @@ -2368,28 +3253,14 @@ void jit_brgemm_kernel_t::ldb_loop(int bd_block2, bool is_bdb_tail, mov(reg_stride_ldb, brg.rd_step * brg.typesize_B * brg.LDB); } - if (brg.req_s8s8_compensation) { - mov(ptr[rsp + reg_bdb_loop_offs_], reg_bdb_loop); - mov(reg_s8_input_shift, 128); - uni_vpbroadcastb(vmm_inp_shift(), reg_s8_input_shift.cvt8()); - mov(reg_bdb_loop, ptr[rsp + reg_bdb_loop_offs_]); - } - if (need_comp_pads && brg.zp_type_a != brgemm_broadcast_t::none) { - mov(ptr[rsp + reg_bdb_loop_offs_], reg_bdb_loop); - const auto reg32_scratch = reg_zp_a_input_shift.cvt32(); - mov(reg32_scratch, 0x1010101); - 
uni_vpbroadcastd(vmm_one_bytes(), reg32_scratch); - mov(reg32_scratch, ptr[rsp + reg_zp_a_val_offs_]); - uni_vpbroadcastd(vmm_zp_a_shift(), reg32_scratch); - mov(reg_bdb_loop, ptr[rsp + reg_bdb_loop_offs_]); - } - if (brg.brgattr.max_bs > 1) mov(reg_BS_loop, reg_BS); L_aligned(BS_loop_label, 64); { - if (check_top_vpad || check_bottom_vpad) { - const auto vpad_first = -brg.brgattr.max_bottom_vpad; - const auto vpad_last = brg.brgattr.max_top_vpad; + if (first_bdb || last_bdb) { + const auto vpad_first + = last_bdb ? (-brg.brgattr.max_bottom_vpad) : 1; + const auto vpad_last + = first_bdb ? brg.brgattr.max_top_vpad : -1; const auto n_vpads = vpad_last - vpad_first + 2; constexpr auto MAX_N_VPADS = 2 * brgemm_desc_t::MAX_VPAD; assert(n_vpads < MAX_N_VPADS); @@ -2414,13 +3285,13 @@ void jit_brgemm_kernel_t::ldb_loop(int bd_block2, bool is_bdb_tail, } else xor_(reg_aux_A_vpad, reg_aux_A_vpad); - for (int vpad = vpad_first; vpad <= vpad_last; vpad++) { + for (dim_t vpad = vpad_first; vpad <= vpad_last; vpad++) { const auto label_vpad = vpad - vpad_first; L(Vpad_loop_iter_label[label_vpad]); - if (!check_top_vpad && vpad > 0) continue; - if (!check_bottom_vpad && vpad < 0) continue; + if (!first_bdb && vpad > 0) continue; + if (!last_bdb && vpad < 0) continue; auto real_vpad = vpad; - if (check_bottom_vpad && brg.bdb_tail && vpad < 0) { + if (last_bdb && brg.bdb_tail && vpad < 0) { if (!is_bdb_tail) { // for last full block before // bdb_tail && -vpad greater than bdb_tail @@ -2440,14 +3311,14 @@ void jit_brgemm_kernel_t::ldb_loop(int bd_block2, bool is_bdb_tail, } cmp(reg_aux_A_vpad, vpad); jne(Vpad_loop_iter_label[label_vpad + 1], T_NEAR); - ld_loop_body(real_vpad); + ld_loop_body(real_vpad, last_bdb); jmp(Vpad_loop_end_label, T_NEAR); } L(Vpad_loop_iter_label[n_vpads - 1]); - ld_loop_body(0); + ld_loop_body(0, last_bdb); L(Vpad_loop_end_label); } else { - ld_loop_body(0); + ld_loop_body(0, last_bdb); } if (brg.brgattr.max_bs > 1) { dec(reg_BS_loop); @@ -2483,68 +3354,77 @@ void jit_brgemm_kernel_t::ldb_loop(int bd_block2, bool is_bdb_tail, template void jit_brgemm_kernel_t::bdb_loop() { - auto do_ldb_loop = [this](int bd_block2, bool is_bdb_tail, - bool check_top_vpad, bool check_bottom_vpad, - int rows_for_rd_tail, bool skip_accumulation) { + auto do_ldb_loop = [this](dim_t bd_block2, bool is_bdb_tail, bool first_bdb, + bool last_bdb, dim_t rows_for_rd_tail, + bool skip_accumulation) { if (brg.ldb2 > 0) { const bool is_ld_reg_tail = false; const bool is_ld_tail = false; ldb_loop(bd_block2, is_bdb_tail, brg.ld_block2, brg.ldb2, - is_ld_reg_tail, is_ld_tail, check_top_vpad, - check_bottom_vpad, rows_for_rd_tail, skip_accumulation); + is_ld_reg_tail, is_ld_tail, first_bdb, last_bdb, + rows_for_rd_tail, skip_accumulation); } if (brg.ldb2_tail > 0) { const bool is_ld_reg_tail = (brg.ldb2 == 0) ? false : true; const bool is_ld_tail = false; ldb_loop(bd_block2, is_bdb_tail, brg.ldb2_tail, 1, is_ld_reg_tail, - is_ld_tail, check_top_vpad, check_bottom_vpad, - rows_for_rd_tail, skip_accumulation); + is_ld_tail, first_bdb, last_bdb, rows_for_rd_tail, + skip_accumulation); } if (brg.ldb_tail > 0) { const bool is_ld_reg_tail = (brg.ldb2 == 0 && brg.ldb2_tail == 0) ? 
false : true; const bool is_ld_tail = true; ldb_loop(bd_block2, is_bdb_tail, 1, 1, is_ld_reg_tail, is_ld_tail, - check_top_vpad, check_bottom_vpad, rows_for_rd_tail, - skip_accumulation); - } - }; - - auto bdb_loop_body = [this, do_ldb_loop](int bd_block2, bool is_bdb_tail, - bool check_top_vpad, bool check_bottom_vpad, - int rows_for_rd_tail, bool skip_accumulation) { - do_ldb_loop(bd_block2, is_bdb_tail, check_top_vpad, check_bottom_vpad, - rows_for_rd_tail, skip_accumulation); - - if (brg.is_runtime_ldc) { - mov(ptr[rsp + reg_aux_C_bdb_loop_backup_offs_], reg_C); - xor_(reg_C, reg_C); - imul(reg_C, ptr[rsp + reg_C_shift_bytes_offs_], - bdb_C_offset(bd_block2)); - add(reg_C, ptr[rsp + reg_aux_C_bdb_loop_backup_offs_]); - } else { - add(reg_C, bdb_C_offset(bd_block2)); - } - if (brg.is_runtime_ldd) { - mov(ptr[rsp + reg_aux_D_bdb_loop_backup_offs_], reg_D); - xor_(reg_D, reg_D); - imul(reg_D, ptr[rsp + reg_D_shift_bytes_offs_], - bdb_D_offset(bd_block2)); - add(reg_D, ptr[rsp + reg_aux_D_bdb_loop_backup_offs_]); - } else { - add(reg_D, bdb_D_offset(bd_block2)); + first_bdb, last_bdb, rows_for_rd_tail, skip_accumulation); } - add(reg_a_offset, bdb_A_offset(bd_block2)); - - advance_bd_block2_post_op_regs(bd_block2); }; - int rows_for_rd_tail, bd_blocks_for_rd_tail; + auto bdb_loop_body + = [this, do_ldb_loop](dim_t bd_block2, bool is_bdb_tail, + bool first_bdb, bool last_bdb, dim_t rows_for_rd_tail, + bool skip_accumulation) { + do_ldb_loop(bd_block2, is_bdb_tail, first_bdb, last_bdb, + rows_for_rd_tail, skip_accumulation); + + if (brg.is_runtime_ldc) { + mov(ptr[rsp + reg_aux_C_bdb_loop_backup_offs_], reg_C); + xor_(reg_C, reg_C); + imul(reg_C, ptr[rsp + reg_C_shift_bytes_offs_], + bdb_C_offset(bd_block2)); + add(reg_C, ptr[rsp + reg_aux_C_bdb_loop_backup_offs_]); + } else { + add(reg_C, bdb_C_offset(bd_block2)); + } + if (brg.is_runtime_ldd) { + mov(ptr[rsp + reg_aux_D_bdb_loop_backup_offs_], reg_D); + xor_(reg_D, reg_D); + imul(reg_D, ptr[rsp + reg_D_shift_bytes_offs_], + bdb_D_offset(bd_block2)); + add(reg_D, ptr[rsp + reg_aux_D_bdb_loop_backup_offs_]); + } else { + add(reg_D, bdb_D_offset(bd_block2)); + } + add(reg_a_offset, bdb_A_offset(bd_block2)); + + if (brg.with_src_dyn_quant) { + mov(reg_src_scales, ptr[rsp + reg_src_scales_offs_]); + add(reg_src_scales, bd_block2 * brg.bd_block * brg.src_scales_stride * sizeof(float)); + mov(ptr[rsp + reg_src_scales_offs_], reg_src_scales); + + mov(reg_src_grouped_sum, ptr[rsp + reg_src_grouped_sum_offs_]); + add(reg_src_grouped_sum, bd_block2 * brg.bd_block * brg.src_grouped_sum_stride * sizeof(int32_t)); + mov(ptr[rsp + reg_src_grouped_sum_offs_], reg_src_grouped_sum); + } + + advance_bd_block2_post_op_regs(bd_block2); + }; + + dim_t rows_for_rd_tail, bd_blocks_for_rd_tail; if (brg.is_tmm) { rows_for_rd_tail = 0; bd_blocks_for_rd_tail = 0; - n_bcast_1_load = false; } else { rows_for_rd_tail = 0; if (brg.rdb_tail != 0 && (brg.is_bf16 || brg.is_int8)) { @@ -2554,23 +3434,10 @@ void jit_brgemm_kernel_t::bdb_loop() { : 0; } bd_blocks_for_rd_tail - = div_up(nstl::max(0, + = div_up(nstl::max(dim_t(0), rows_for_rd_tail - brg.bdb_tail + brg.brgattr.max_bottom_vpad), brg.bd_block); - - auto ld_block2 = (brg.ldb2 > 0) - ? brg.ld_block2 - : ((brg.ldb2_tail > 0) ? 
brg.ldb2_tail : 1); - const int free_vregs = max_effective_vregs - brg.req_s8s8_compensation; - n_bcast_1_load = brg.is_int8 - && ((brg.bd_block * (ld_block2 + 1) < free_vregs) - && (bd_blocks_for_rd_tail == 0) - && (rows_for_rd_tail == 0)); - if (brg.brgattr.hint_loop_order != brgemm_lo_default) - n_bcast_1_load = (brg.brgattr.hint_loop_order == brgemm_lo_bl_1load) - ? true - : false; } auto bdb_loop_avx512 = [&](bool skip_accumulation) { @@ -2656,26 +3523,60 @@ void jit_brgemm_kernel_t::bdb_loop() { L_aligned(bdb_loop_end_label, 64); }; auto bdb_loop_amx = [&](bool skip_accumulation) { - Label bdb_loop_label; - if (brg.bd_block2 >= 1) { - mov(reg_bdb_loop, brg.bdb2); - mov(ptr[rsp + reg_bdb_loop_offs_], reg_bdb_loop); - L_aligned(bdb_loop_label, 64); - { - bdb_loop_body(brg.bd_block2, false, false, false, 0, + if (brg.amx_wary_k_tail()) { + Label bdb_loop_label; + auto bdblocks = brg.bdb2; + if (bdblocks > 1) { + mov(reg_bdb_loop, brg.bdb2); + mov(ptr[rsp + reg_bdb_loop_offs_], reg_bdb_loop); + L_aligned(bdb_loop_label, 64); + { + bdb_loop_body(brg.bd_block2, false, false, false, 0, + skip_accumulation); + mov(reg_bdb_loop, ptr[rsp + reg_bdb_loop_offs_]); + dec(reg_bdb_loop); + cmp(reg_bdb_loop, 1); + mov(ptr[rsp + reg_bdb_loop_offs_], reg_bdb_loop); + } + jg(bdb_loop_label, T_NEAR); + bdblocks = 1; + } + if (bdblocks == 1) { + const bool last_bdb = brg.bdb2_tail == 0 && brg.bdb_tail == 0; + bdb_loop_body(brg.bd_block2, false, false, last_bdb, 0, skip_accumulation); - mov(reg_bdb_loop, ptr[rsp + reg_bdb_loop_offs_]); - dec(reg_bdb_loop); - cmp(reg_bdb_loop, 0); + } + + if (brg.bdb2_tail > 0) { + const bool last_bdb = brg.bdb_tail == 0; + bdb_loop_body(brg.bdb2_tail, false, false, last_bdb, 0, + skip_accumulation); + } + if (brg.bdb_tail > 0) + do_ldb_loop(1, true, false, false, 0, skip_accumulation); + + } else { + Label bdb_loop_label; + if (brg.bd_block2 >= 1) { + mov(reg_bdb_loop, brg.bdb2); mov(ptr[rsp + reg_bdb_loop_offs_], reg_bdb_loop); + L_aligned(bdb_loop_label, 64); + { + bdb_loop_body(brg.bd_block2, false, false, false, 0, + skip_accumulation); + mov(reg_bdb_loop, ptr[rsp + reg_bdb_loop_offs_]); + dec(reg_bdb_loop); + cmp(reg_bdb_loop, 0); + mov(ptr[rsp + reg_bdb_loop_offs_], reg_bdb_loop); + } + jg(bdb_loop_label, T_NEAR); } - jg(bdb_loop_label, T_NEAR); + if (brg.bdb2_tail > 0) + bdb_loop_body(brg.bdb2_tail, false, false, false, 0, + skip_accumulation); + if (brg.bdb_tail > 0) + do_ldb_loop(1, true, false, false, 0, skip_accumulation); } - if (brg.bdb2_tail > 0) - bdb_loop_body( - brg.bdb2_tail, false, false, false, 0, skip_accumulation); - if (brg.bdb_tail > 0) - do_ldb_loop(1, true, false, false, 0, skip_accumulation); }; auto bdb_loop_general = [&](bool skip_accumulation) { @@ -2736,7 +3637,26 @@ void jit_brgemm_kernel_t::generate() { if (brg.is_int8 && !brg.has_int8_vnni) { mov(reg_tmp_gpr.cvt16(), 0x1); - vpbroadcastw(int8_ones_words(), reg_tmp_gpr.cvt16()); + + if (is_superset(brg.isa_impl, avx512_core)) + vpbroadcastw(int8_ones_words(), reg_tmp_gpr.cvt16()); + else if (is_superset(brg.isa_impl, avx2)) { + movq(Xmm(int8_ones_words().getIdx()), reg_tmp_gpr); + vpbroadcastw(int8_ones_words(), Xmm(int8_ones_words().getIdx())); + } else + assert(!"unsupported isa"); + } + + if (brg.is_f16_b_non_amx_vnni()) { + mov(reg_tmp_gpr, f16_perm_even_table_); + vmovups(f16_perm_even_vreg(), ptr[reg_tmp_gpr]); + mov(reg_tmp_gpr, f16_perm_odd_table_); + vmovups(f16_perm_odd_vreg(), ptr[reg_tmp_gpr]); + } + + if (brg.is_tmm && brg.amx_wary_k_tail()) { + // save tiles description for 
later use
+        brgemm_init_tiles(brg, (char *)(&palette_));
+    }
 
     read_params();
@@ -2748,19 +3668,19 @@ void jit_brgemm_kernel_t<Wmm>::generate() {
     postamble();
 
     align(32);
-    const int simd = vreg_traits<Vmm>::vlen / sizeof(float);
+    const dim_t simd = vreg_traits_t<Vmm>::vlen / sizeof(float);
     if (!isa_has_masks(brg.isa_impl) && brg.ldb_tail > 0) {
         L(avx_tail_mask_);
-        for (int i = 0; i < brg.ldb_tail; ++i)
+        for (dim_t i = 0; i < brg.ldb_tail; ++i)
             dd(0xffffffff);
-        for (int i = brg.ldb_tail; i < simd; ++i)
+        for (dim_t i = brg.ldb_tail; i < simd; ++i)
             dd(0);
     }
     if (!is_superset(brg.isa_impl, avx512_core) && brg.with_sum
             && brg.sum_scale != 1.f) {
         L(sum_zp_scale_data_);
-        const int scale_int = float2int(brg.sum_scale);
-        for (int i = 0; i < simd; ++i)
+        const dim_t scale_int = float2int(brg.sum_scale);
+        for (dim_t i = 0; i < simd; ++i)
             dd(scale_int);
     }
 
@@ -2771,6 +3691,25 @@ void jit_brgemm_kernel_t<Wmm>::generate() {
 
     if (brg.with_eltwise)
         postops_injector_->prepare_table(/* generate = */ true);
+
+    if (brg.is_f16_b_non_amx_vnni()) {
+        // convert interleaved vnni data with holes to packed.
+        align(64);
+        L(f16_perm_even_table_);
+        for (dim_t i = 0; i < 32; ++i) {
+            if (i < 16)
+                dw(uint16_t(2 * i));
+            else
+                dw(uint16_t(0));
+        }
+        align(64);
+        L(f16_perm_odd_table_);
+        for (dim_t i = 0; i < 32; ++i)
+            if (i < 16)
+                dw(uint16_t(2 * i + 1));
+            else
+                dw(uint16_t(0));
+    }
 }
 
 brgemm_attr_t::brgemm_attr_t()
@@ -2785,7 +3724,8 @@ brgemm_attr_t::brgemm_attr_t()
     , hint_innermost_loop(brgemm_ld_loop_innermost)
     , hint_loop_order(brgemm_kernel_loop_order_t::brgemm_lo_default)
     , hint_prefetching(brgemm_kernel_prefetching_t::brgemm_prf_default)
-    , wary_tail_read(true)
+    , wary_A_k_tail_read(true)
+    , extendable_k(false)
     , generate_skip_accumulation(false)
     , bd_mask_level(0)
     , use_uker(false)
@@ -2814,7 +3754,7 @@ void brgemm_kernel_common_t<isa, Wmm>::operator()(
 }
 
 template <cpu_isa_t isa, typename Wmm>
-const jit_generator *brgemm_kernel_common_t<isa, Wmm>::get_jit_generator() const {
+const jit_generator_t *brgemm_kernel_common_t<isa, Wmm>::get_jit_generator() const {
     return brgemm_kernel_;
 }
 
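The two 32-entry word tables emitted at the end of generate() above are shuffle indices: a vpermw-style gather with the even table pulls words 0, 2, ..., 30 of a VNNI-interleaved B register into packed form, and the odd table pulls words 1, 3, ..., 31. A minimal scalar sketch of that transform, assuming a 32-lane vector of raw 16-bit f16 words (all names here are illustrative, not library API):

```cpp
#include <array>
#include <cstdint>

// Scalar model of the even/odd extraction the permutation tables encode.
// vnni[] holds 32 f16 values as raw 16-bit words, two rows interleaved.
std::array<uint16_t, 16> extract_even(const std::array<uint16_t, 32> &vnni) {
    std::array<uint16_t, 16> packed {};
    for (int i = 0; i < 16; ++i)
        packed[i] = vnni[2 * i]; // same indices dw(uint16_t(2 * i)) emits
    return packed;
}

std::array<uint16_t, 16> extract_odd(const std::array<uint16_t, 32> &vnni) {
    std::array<uint16_t, 16> packed {};
    for (int i = 0; i < 16; ++i)
        packed[i] = vnni[2 * i + 1]; // mirrors dw(uint16_t(2 * i + 1))
    return packed;
}
```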
diff --git a/src/cpu/x64/cpu_barrier.cpp b/src/cpu/x64/cpu_barrier.cpp
index 24ab6515b02..2ab3bb5c4a5 100644
--- a/src/cpu/x64/cpu_barrier.cpp
+++ b/src/cpu/x64/cpu_barrier.cpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2017-2022 Intel Corporation
+* Copyright 2017-2025 Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -26,7 +26,7 @@ namespace x64 {
 namespace simple_barrier {
 
 void generate(
-        jit_generator &code, Xbyak::Reg64 reg_ctx, Xbyak::Reg64 reg_nthr) {
+        jit_generator_t &code, Xbyak::Reg64 reg_ctx, Xbyak::Reg64 reg_nthr) {
 #define BAR_CTR_OFF offsetof(ctx_t, ctr)
 #define BAR_SENSE_OFF offsetof(ctx_t, sense)
     using namespace Xbyak;
@@ -81,7 +81,7 @@ void generate(
 }
 
 /** jit barrier generator */
-struct jit_t : public jit_generator {
+struct jit_t : public jit_generator_t {
 
     void generate() override {
         simple_barrier::generate(*this, abi_param1, abi_param2);
@@ -89,7 +89,7 @@ struct jit_t : public jit_generator {
     }
 
     // TODO: Need to check status
-    jit_t() : jit_generator(jit_name()) { create_kernel(); }
+    jit_t() : jit_generator_t(jit_name()) { create_kernel(); }
     DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_t)
 };
 
diff --git a/src/cpu/x64/cpu_barrier.hpp b/src/cpu/x64/cpu_barrier.hpp
index c76d57911af..f5cd7966ac9 100644
--- a/src/cpu/x64/cpu_barrier.hpp
+++ b/src/cpu/x64/cpu_barrier.hpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2017-2020 Intel Corporation
+* Copyright 2017-2025 Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -68,11 +68,12 @@ void barrier(ctx_t *ctx, int nthr);
 
 /** injects actual barrier implementation into another jitted code
  * @params:
- *   code      -- jit_generator object where the barrier is to be injected
+ *   code      -- jit_generator_t object where the barrier is to be injected
 *   reg_ctx   -- read-only register with pointer to the barrier context
 *   reg_nnthr -- read-only register with the # of synchronizing threads
 */
-void generate(jit_generator &code, Xbyak::Reg64 reg_ctx, Xbyak::Reg64 reg_nthr);
+void generate(
+        jit_generator_t &code, Xbyak::Reg64 reg_ctx, Xbyak::Reg64 reg_nthr);
 
 } // namespace simple_barrier
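For readers unfamiliar with the scheme the jitted barrier above implements: ctx_t carries a counter and a sense word (see BAR_CTR_OFF and BAR_SENSE_OFF), each arriving thread increments the counter, and the last arrival resets it and flips the shared sense. A portable sense-reversing sketch under those assumptions; field names and memory orderings here are illustrative, not the library's implementation:

```cpp
#include <atomic>
#include <cstddef>

struct barrier_ctx_t {
    std::atomic<std::size_t> ctr {0}; // threads arrived so far
    std::atomic<std::size_t> sense {0}; // flips once per barrier episode
};

void barrier_wait(
        barrier_ctx_t &ctx, std::size_t nthr, std::size_t &local_sense) {
    local_sense = 1 - local_sense; // each thread tracks its own phase
    if (ctx.ctr.fetch_add(1, std::memory_order_acq_rel) + 1 == nthr) {
        ctx.ctr.store(0, std::memory_order_relaxed); // last thread resets
        ctx.sense.store(local_sense, std::memory_order_release); // release all
    } else {
        while (ctx.sense.load(std::memory_order_acquire) != local_sense) {
            // spin until the last thread flips the shared sense
        }
    }
}
```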
diff --git a/src/cpu/x64/cpu_isa_traits.cpp b/src/cpu/x64/cpu_isa_traits.cpp
index 931f13a8c2b..c9d718e1132 100644
--- a/src/cpu/x64/cpu_isa_traits.cpp
+++ b/src/cpu/x64/cpu_isa_traits.cpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2019-2024 Intel Corporation
+* Copyright 2019-2025 Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -46,7 +46,7 @@ cpu_isa_t init_max_cpu_isa() {
 
     if (!isa_val.empty()) {
 #define IF_HANDLE_CASE(cpu_isa) \
-    if (isa_val.compare(cpu_isa_traits<cpu_isa>::user_option_env) == 0) \
+    if (isa_val.compare(cpu_isa_traits_t<cpu_isa>::user_option_env) == 0) \
     max_cpu_isa_val = cpu_isa
 #define ELSEIF_HANDLE_CASE(cpu_isa) else IF_HANDLE_CASE(cpu_isa)
 
@@ -206,7 +206,9 @@ status_t set_max_cpu_isa(dnnl_cpu_isa_t isa) {
     cpu_isa_t isa_to_set = isa_undef;
 #define HANDLE_CASE(cpu_isa) \
-    case cpu_isa_traits<cpu_isa>::user_option_val: isa_to_set = cpu_isa; break;
+    case cpu_isa_traits_t<cpu_isa>::user_option_val: \
+        isa_to_set = cpu_isa; \
+        break;
     switch (isa) {
         HANDLE_CASE(isa_all);
         HANDLE_CASE(sse41);
diff --git a/src/cpu/x64/cpu_isa_traits.hpp b/src/cpu/x64/cpu_isa_traits.hpp
index 89233c48d4e..3fda4777f73 100644
--- a/src/cpu/x64/cpu_isa_traits.hpp
+++ b/src/cpu/x64/cpu_isa_traits.hpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2018-2024 Intel Corporation
+* Copyright 2018-2025 Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -28,17 +28,25 @@
 
 #include "cpu/platform.hpp"
 
+#if !defined(XBYAK64)
 #define XBYAK64
+#endif
+
+#if !defined(XBYAK_NO_OP_NAMES)
 #define XBYAK_NO_OP_NAMES
+#endif
+
 /* in order to make selinux happy memory that would be marked with X-bit should
  * be obtained with mmap */
+#if !defined(XBYAK_USE_MMAP_ALLOCATOR)
 #define XBYAK_USE_MMAP_ALLOCATOR
+#endif
 
+#ifdef DNNL_XBYAK_NO_EXCEPTION
+#if defined(NDEBUG) && !defined(XBYAK_NO_EXCEPTION)
 #define XBYAK_NO_EXCEPTION
-#ifndef NDEBUG
-#undef XBYAK_NO_EXCEPTION
 #endif
-
+#endif
 #if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
 /* turn off `size_t to other-type implicit casting` warning
  * currently we have a lot of jit-generated instructions that
@@ -47,8 +55,8 @@
 #pragma warning(disable : 4267)
 #endif
 #include "common/compiler_workarounds.hpp"
-#include "cpu/x64/xbyak/xbyak.h"
-#include "cpu/x64/xbyak/xbyak_util.h"
+#include "xbyak/xbyak.h"
+#include "xbyak/xbyak_util.h"
 
 namespace dnnl {
 namespace impl {
@@ -79,6 +87,7 @@ enum cpu_isa_bit_t : unsigned {
     amx_int8_bit = 1u << 15,
     amx_bf16_bit = 1u << 16,
     amx_fp16_bit = 1u << 17,
+    avx512_vpopcnt_bit = 1u << 18,
 
     // Fill in hints from most significant bit to least significant bit
     prefer_ymm_bit = 1u << (cpu_isa_total_bits - 1),
@@ -109,7 +118,11 @@ inline unsigned cvt2mask(dnnl_cpu_isa_hints_t hints) {
 };
 
 inline bool is_hints_bit_set(cpu_isa_bit_t hint_bit, bool soft) {
+#if DNNL_X64
     const dnnl_cpu_isa_hints_t hints = get_cpu_isa_hints(soft);
+#else
+    const dnnl_cpu_isa_hints_t hints = dnnl_cpu_isa_no_hints;
+#endif
     const unsigned cur_hints_mask = cpu_isa_hints_utils::cvt2mask(hints);
     return (cur_hints_mask & hint_bit) == hint_bit;
 }
@@ -136,6 +149,7 @@ enum cpu_isa_t : unsigned {
     avx512_core_amx = avx10_1_512_amx,
     avx10_1_512_amx_fp16 = avx10_1_512_amx | amx_fp16,
     avx512_core_amx_fp16 = avx10_1_512_amx_fp16,
+    avx512_vpopcnt = avx512_vpopcnt_bit,
 
     // NOTES: 1. isa_all by default has no isa specific hints
     isa_all = ~0u & ~cpu_isa_hints_utils::hints_mask,
 };
@@ -209,28 +223,28 @@ static inline bool is_superset(cpu_isa_t isa_1, cpu_isa_t isa_2) {
 }
 
 template <typename Vmm>
-struct vreg_traits {};
+struct vreg_traits_t {};
 
 template <>
-struct vreg_traits<Xbyak::Zmm> {
-    typedef Xbyak::Ymm Vmm_lower_t;
+struct vreg_traits_t<Xbyak::Zmm> {
+    using Vmm_lower_t = Xbyak::Ymm;
     static constexpr size_t vlen = 64;
 };
 
 template <>
-struct vreg_traits<Xbyak::Ymm> {
-    typedef Xbyak::Xmm Vmm_lower_t;
+struct vreg_traits_t<Xbyak::Ymm> {
+    using Vmm_lower_t = Xbyak::Xmm;
     static constexpr size_t vlen = 32;
 };
 
 template <>
-struct vreg_traits<Xbyak::Xmm> {
-    typedef Xbyak::Xmm Vmm_lower_t;
+struct vreg_traits_t<Xbyak::Xmm> {
+    using Vmm_lower_t = Xbyak::Xmm;
     static constexpr size_t vlen = 16;
 };
 
 template <cpu_isa_t>
-struct cpu_isa_traits {}; /* ::vlen -> 32 (for avx2) */
+struct cpu_isa_traits_t {}; /* ::vlen -> 32 (for avx2) */
 
 // pack struct so it can fit into a single 64-byte cache line
 #pragma pack(push, 1)
@@ -244,96 +258,105 @@ struct palette_config_t {
 #pragma pack(pop)
 
 template <>
-struct cpu_isa_traits<isa_all> {
+struct cpu_isa_traits_t<isa_all> {
     static constexpr dnnl_cpu_isa_t user_option_val = dnnl_cpu_isa_default;
     static constexpr const char *user_option_env = "default";
 };
 
 template <>
-struct cpu_isa_traits<sse41> {
-    typedef Xbyak::Xmm Vmm;
+struct cpu_isa_traits_t<sse41> {
+    using Vmm = Xbyak::Xmm;
     static constexpr int vlen_shift = 4;
-    static constexpr int vlen = vreg_traits<Vmm>::vlen;
+    static constexpr int vlen = vreg_traits_t<Vmm>::vlen;
     static constexpr int n_vregs = 16;
     static constexpr dnnl_cpu_isa_t user_option_val = dnnl_cpu_isa_sse41;
     static constexpr const char *user_option_env = "sse41";
 };
 
 template <>
-struct cpu_isa_traits<avx> {
-    typedef Xbyak::Ymm Vmm;
+struct cpu_isa_traits_t<avx> {
+    using Vmm = Xbyak::Ymm;
     static constexpr int vlen_shift = 5;
-    static constexpr int vlen = vreg_traits<Vmm>::vlen;
+    static constexpr int vlen = vreg_traits_t<Vmm>::vlen;
     static constexpr int n_vregs = 16;
     static constexpr dnnl_cpu_isa_t user_option_val = dnnl_cpu_isa_avx;
     static constexpr const char *user_option_env = "avx";
 };
 
 template <>
-struct cpu_isa_traits<avx2> : public cpu_isa_traits<avx> {
+struct cpu_isa_traits_t<avx2> : public cpu_isa_traits_t<avx> {
     static constexpr dnnl_cpu_isa_t user_option_val = dnnl_cpu_isa_avx2;
     static constexpr const char *user_option_env = "avx2";
 };
 
 template <>
-struct cpu_isa_traits<avx2_vnni> : public cpu_isa_traits<avx2> {
+struct cpu_isa_traits_t<avx2_vnni> : public cpu_isa_traits_t<avx2> {
     static constexpr dnnl_cpu_isa_t user_option_val = dnnl_cpu_isa_avx2_vnni;
     static constexpr const char *user_option_env = "avx2_vnni";
 };
 
 template <>
-struct cpu_isa_traits<avx2_vnni_2> : public cpu_isa_traits<avx2_vnni> {
+struct cpu_isa_traits_t<avx2_vnni_2> : public cpu_isa_traits_t<avx2_vnni> {
     static constexpr dnnl_cpu_isa_t user_option_val = dnnl_cpu_isa_avx2_vnni_2;
     static constexpr const char *user_option_env = "avx2_vnni_2";
 };
 
 template <>
-struct cpu_isa_traits<avx512_core> {
-    typedef Xbyak::Zmm Vmm;
+struct cpu_isa_traits_t<avx512_core> {
+    using Vmm = Xbyak::Zmm;
     static constexpr int vlen_shift = 6;
-    static constexpr int vlen = vreg_traits<Vmm>::vlen;
+    static constexpr int vlen = vreg_traits_t<Vmm>::vlen;
     static constexpr int n_vregs = 32;
     static constexpr dnnl_cpu_isa_t user_option_val = dnnl_cpu_isa_avx512_core;
     static constexpr const char *user_option_env = "avx512_core";
 };
 
 template <>
-struct cpu_isa_traits<avx512_core_vnni> : public cpu_isa_traits<avx512_core> {
+struct cpu_isa_traits_t<avx512_core_vnni>
+    : public cpu_isa_traits_t<avx512_core> {
     static constexpr dnnl_cpu_isa_t user_option_val
             = dnnl_cpu_isa_avx512_core_vnni;
     static constexpr const char *user_option_env = "avx512_core_vnni";
 };
 
 template <>
-struct cpu_isa_traits<avx512_core_bf16> : public cpu_isa_traits<avx512_core_vnni> {
+struct cpu_isa_traits_t<avx512_core_bf16>
+    : public cpu_isa_traits_t<avx512_core_vnni> {
     static constexpr dnnl_cpu_isa_t user_option_val
             = dnnl_cpu_isa_avx512_core_bf16;
     static constexpr const char *user_option_env = "avx512_core_bf16";
 };
 
 template <>
-struct cpu_isa_traits<avx10_1_512_amx> {
-    typedef Xbyak::Zmm Vmm;
-    static constexpr int vlen = vreg_traits<Vmm>::vlen;
+struct cpu_isa_traits_t<avx10_1_512_amx> {
+    using Vmm = Xbyak::Zmm;
+    static constexpr int vlen = vreg_traits_t<Vmm>::vlen;
     static constexpr dnnl_cpu_isa_t user_option_val
             = dnnl_cpu_isa_avx10_1_512_amx;
     static constexpr const char *user_option_env = "avx10_1_512_amx";
 };
 
 template <>
-struct cpu_isa_traits<avx10_1_512> : public cpu_isa_traits<avx512_core> {
+struct cpu_isa_traits_t<avx10_1_512> : public cpu_isa_traits_t<avx512_core> {
     static constexpr dnnl_cpu_isa_t user_option_val = dnnl_cpu_isa_avx10_1_512;
     static constexpr const char *user_option_env = "avx10_1_512";
 };
 
 template <>
-struct cpu_isa_traits<avx10_1_512_amx_fp16> {
-    typedef Xbyak::Zmm Vmm;
+struct cpu_isa_traits_t<avx10_1_512_amx_fp16> {
+    using Vmm = Xbyak::Zmm;
     static constexpr dnnl_cpu_isa_t user_option_val
             = dnnl_cpu_isa_avx10_1_512_amx_fp16;
     static constexpr const char *user_option_env = "avx10_1_512_amx_fp16";
 };
 
+template <>
+struct cpu_isa_traits_t<avx512_vpopcnt> {
+    static constexpr dnnl_cpu_isa_t user_option_val
+            = dnnl_cpu_isa_avx512_vpopcnt;
+    static constexpr const char *user_option_env = "AVX512_VPOPCNT";
+};
+
 inline const Xbyak::util::Cpu &cpu() {
     const static Xbyak::util::Cpu cpu_;
     return cpu_;
@@ -354,11 +377,16 @@ bool DNNL_API is_available();
 
 namespace {
 
-static inline bool mayiuse(const cpu_isa_t cpu_isa, bool soft = false) {
+inline bool mayiuse(const cpu_isa_t cpu_isa, bool soft = false) {
     using namespace Xbyak::util;
-
-    unsigned cpu_isa_mask = x64::get_max_cpu_isa_mask(soft);
-    unsigned cpu_isa_no_hints = cpu_isa & ~cpu_isa_hints_utils::hints_mask;
+#if DNNL_X64
+    const unsigned cpu_isa_mask = x64::get_max_cpu_isa_mask(soft);
+#elif DNNL_X86
+    const unsigned cpu_isa_mask = isa_undef;
+#else
+    const unsigned cpu_isa_mask = isa_all;
+#endif
+    const unsigned cpu_isa_no_hints = cpu_isa & ~cpu_isa_hints_utils::hints_mask;
 
     if ((cpu_isa_mask & cpu_isa_no_hints) != cpu_isa_no_hints) return false;
 
@@ -412,29 +440,31 @@ static inline bool mayiuse(const cpu_isa_t cpu_isa, bool soft = false) {
         case avx512_core_amx_fp16:
             REG_AMX_ISA(return mayiuse(avx512_core_amx, soft)
                     && mayiuse(amx_fp16, soft));
+        case avx512_vpopcnt:
+            REG_AVX512_ISA(return cpu().has(Cpu::tAVX512_VPOPCNTDQ));
         case isa_all: return false;
         case isa_undef: return true;
     }
     return false;
 }
 
-static inline bool isa_has_int8_vnni(cpu_isa_t isa) {
+inline bool isa_has_int8_vnni(cpu_isa_t isa) {
     return is_superset(isa, avx512_core_vnni) || is_superset(isa, avx2_vnni);
 }
 
-static inline bool isa_has_s8s8(cpu_isa_t isa) {
+inline bool isa_has_s8s8(cpu_isa_t isa) {
     return is_superset(isa, amx_int8) || is_superset(isa, avx2_vnni_2);
 }
 
-static inline bool isa_has_bf16(cpu_isa_t isa) {
+inline bool isa_has_bf16(cpu_isa_t isa) {
    return is_superset(isa, avx512_core_bf16);
 }
 
-static inline bool isa_has_masks(cpu_isa_t isa) {
+inline bool isa_has_masks(cpu_isa_t isa) {
     return is_superset(isa, avx512_core);
 }
 
-static inline int isa_max_vlen(cpu_isa_t isa) {
+inline int isa_max_vlen(cpu_isa_t isa) {
     const bool is_avx512 = is_superset(isa, avx512_core);
     const bool is_avx = is_superset(isa, avx);
     const bool is_sse41 = is_superset(isa, sse41);
@@ -443,14 +473,14 @@ static inline int isa_max_vlen(cpu_isa_t isa) {
     MAYBE_UNUSED(is_sse41);
 
     if (is_avx512)
-        return cpu_isa_traits<avx512_core>::vlen;
+        return cpu_isa_traits_t<avx512_core>::vlen;
     else if (is_avx)
-        return cpu_isa_traits<avx>::vlen;
+        return cpu_isa_traits_t<avx>::vlen;
     else
-        return cpu_isa_traits<sse41>::vlen;
 }
 
-static inline int isa_num_vregs(cpu_isa_t isa) {
+        return cpu_isa_traits_t<sse41>::vlen;
+}
+
+inline int isa_num_vregs(cpu_isa_t isa) {
     const bool is_avx512 = is_superset(isa, avx512_core);
     const bool is_avx = is_superset(isa, avx);
     const bool is_sse41 = is_superset(isa, sse41);
@@ -459,11 +489,11 @@ static inline int isa_num_vregs(cpu_isa_t isa) {
     MAYBE_UNUSED(is_sse41);
 
     if (is_avx512)
-        return cpu_isa_traits<avx512_core>::n_vregs;
+        return cpu_isa_traits_t<avx512_core>::n_vregs;
     else if (is_avx)
-        return cpu_isa_traits<avx>::n_vregs;
+        return cpu_isa_traits_t<avx>::n_vregs;
     else
-        return cpu_isa_traits<sse41>::n_vregs;
+        return cpu_isa_traits_t<sse41>::n_vregs;
 }
 
 } // namespace
@@ -494,10 +524,12 @@ inline data_type_t get_mac_emu_data_type(const data_type_t data_type,
     using namespace data_type;
     if (req_emulation) switch (data_type) {
             case bf16:
-                if (isa == avx2_vnni_2) return f32;
+                if (utils::one_of(isa, avx2, avx2_vnni_2, avx512_core))
+                    return f32;
                 break;
             case f16:
-                if (utils::one_of(isa, avx2_vnni_2, avx512_core_fp16))
+                if (utils::one_of(isa, avx2, avx2_vnni_2, avx512_core,
+                            avx512_core_fp16))
                     return f32;
                 break;
             case f8_e5m2:
@@ -520,7 +552,11 @@ inline size_t data_type_vnni_granularity(const data_type_t data_type) {
         case f32:
         case s32: return size_t(1);
         case f16:
-        case bf16: return size_t(2);
+        case bf16:
+        case s4:
+        case u4:
+        case nf4:
+        case f4_e2m1: return size_t(2);
         case f8_e5m2:
         case f8_e4m3:
        case s8:
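The traits renamed above are how kernels avoid hard-coding vector widths: code templated on an ISA (or a register type) reads vlen and n_vregs from cpu_isa_traits_t, and runtime dispatch goes through mayiuse(). A hypothetical usage sketch; the helper itself is not part of the library:

```cpp
#include <cstddef>

using namespace dnnl::impl::cpu::x64;

// How many f32 lanes one vector register holds for a given ISA.
template <cpu_isa_t isa>
constexpr std::size_t f32_lanes() {
    return cpu_isa_traits_t<isa>::vlen / sizeof(float);
}

std::size_t pick_lanes() {
    if (mayiuse(avx512_core)) return f32_lanes<avx512_core>(); // 16
    if (mayiuse(avx2)) return f32_lanes<avx2>(); // 8
    return f32_lanes<sse41>(); // 4
}
```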
diff --git a/src/cpu/x64/cpu_reducer.cpp b/src/cpu/x64/cpu_reducer.cpp
index a000d8b5fca..3bcb5ed0b5c 100644
--- a/src/cpu/x64/cpu_reducer.cpp
+++ b/src/cpu/x64/cpu_reducer.cpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2017-2023 Intel Corporation
+* Copyright 2017-2025 Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -97,12 +97,12 @@ void reduce_balancer_t::balance() {
 using namespace Xbyak;
 
 template <impl::data_type_t data_type>
-struct reducer_2d_driver_t : public jit_generator {
-    using data_t = typename prec_traits<data_type>::type;
+struct reducer_2d_driver_t : public jit_generator_t {
+    using data_t = typename prec_traits_t<data_type>::type;
 
     reducer_2d_driver_t(int n_src, size_t src_ld, size_t src_step,
             size_t dst_step, bool nullify_dst, const char *name)
-        : jit_generator(name)
+        : jit_generator_t(name)
         , n_src_(n_src)
         , src_ld_(src_ld)
         , src_step_(src_step)
@@ -122,11 +122,11 @@ template <impl::data_type_t data_type, cpu_isa_t isa>
 struct reducer_2d_driver_f_s_32_t : public reducer_2d_driver_t<data_type> {
     DECLARE_CPU_JIT_AUX_FUNCTIONS(reducer_2d_driver_f_s_32_t)
 
-    using data_t = typename prec_traits<data_type>::type;
+    using data_t = typename prec_traits_t<data_type>::type;
 
     void operator()(
             data_t *dst, const data_t *srcs, size_t ny, size_t nx) override {
-        jit_generator::operator()(dst, srcs, ny, nx);
+        jit_generator_t::operator()(dst, srcs, ny, nx);
     }
 
     /* cpu specific part */
@@ -145,9 +145,9 @@ struct reducer_2d_driver_f_s_32_t : public reducer_2d_driver_t<data_type> {
             this->paddd(x1, op);
     }
 
-    const int vlen = cpu_isa_traits<isa>::vlen;
+    const int vlen = cpu_isa_traits_t<isa>::vlen;
     const int typesize
-            = sizeof(typename dnnl::impl::prec_traits<data_type>::type);
+            = sizeof(typename dnnl::impl::prec_traits_t<data_type>::type);
     Xbyak::Reg64 reg_dst = abi_param1;
     Xbyak::Reg64 reg_src = abi_param2;
     Xbyak::Reg64 reg_ny = abi_param3;
@@ -195,17 +195,32 @@ struct reducer_2d_driver_f_s_32_t : public reducer_2d_driver_t<data_type> {
 
         for (int i = 0; i < nloads; ++i) {
             size_t off = base_off + i * load_len;
-            if (load_len == typesize)
-                this->uni_add(Xmm(i), this->ptr[reg_src + off]);
-            else if (load_len == vlen)
-                this->uni_vadd(Vmm(i), Vmm(i), vmmword[reg_src + off]);
+            if (load_len == typesize) {
+                assert(nloads == 1);
+                if (off > static_cast<size_t>(INT_MAX)) {
+                    this->mov(reg_long_offt, off);
+                    this->movd(Xmm(nloads + i),
+                            this->ptr[reg_src + reg_long_offt]);
+                    this->uni_add(Xmm(i), Xmm(nloads + i));
+                } else {
+                    this->movd(Xmm(nloads + i), this->ptr[reg_src + off]);
+                    this->uni_add(Xmm(i), Xmm(nloads + i));
+                }
+            } else if (load_len == vlen)
+                if (off > static_cast<size_t>(INT_MAX)) {
+                    this->mov(reg_long_offt, off);
+                    this->uni_vadd(
+                            Vmm(i), Vmm(i), vmmword[reg_src + reg_long_offt]);
+                } else {
+                    this->uni_vadd(Vmm(i), Vmm(i), vmmword[reg_src + off]);
+                }
             else
                 assert(!"unsupported");
         }
     }
 
     void loop_x() {
-        const int nloads[] = {cpu_isa_traits<isa>::n_vregs, 1, 1};
+        const int nloads[] = {cpu_isa_traits_t<isa>::n_vregs, 1, 1};
         const int nbranches = sizeof(nloads) / sizeof(nloads[0]);
 
         const int load_len[nbranches] = {vlen, vlen, typesize};
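The INT_MAX guards added above exist because an x86-64 address operand encodes its displacement as a signed 32-bit immediate, so offsets past that range must first be materialized in a register. A standalone Xbyak sketch of the same pattern; register choices and names are illustrative only:

```cpp
#include <climits>
#include <cstddef>
#include "xbyak/xbyak.h"

// Emits acc += *(base + off) for an offset that may not fit in a disp32.
struct long_offset_add_t : Xbyak::CodeGenerator {
    explicit long_offset_add_t(std::size_t off) {
        using namespace Xbyak;
        const Reg64 base = rdi; // first integer argument (SysV ABI)
        const Reg64 scratch = r10;
        if (off > static_cast<std::size_t>(INT_MAX)) {
            mov(scratch, off); // 64-bit immediate into a scratch register
            vaddps(xmm0, xmm0, ptr[base + scratch]);
        } else {
            vaddps(xmm0, xmm0, ptr[base + off]); // fits in disp32
        }
        ret();
    }
};
```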
diff --git a/src/cpu/x64/cpu_reducer.hpp b/src/cpu/x64/cpu_reducer.hpp
index 2ecf022b859..d07e7545b7d 100644
--- a/src/cpu/x64/cpu_reducer.hpp
+++ b/src/cpu/x64/cpu_reducer.hpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2017-2020 Intel Corporation
+* Copyright 2017-2025 Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -168,7 +168,7 @@ struct reducer_2d_driver_t;
  */
 template <impl::data_type_t data_type>
 struct cpu_reducer_t {
-    typedef typename prec_traits<data_type>::type data_t;
+    using data_t = typename prec_traits_t<data_type>::type;
 
     struct conf_t {
         conf_t() = default;
@@ -248,7 +248,7 @@ struct cpu_reducer_t {
 
 template <impl::data_type_t data_type>
 struct cpu_reducer_2d_t {
-    typedef typename prec_traits<data_type>::type data_t;
+    using data_t = typename prec_traits_t<data_type>::type;
 
     struct conf_t {
         conf_t() = default;
@@ -333,7 +333,7 @@ struct cpu_reducer_2d_t {
 /** simple 1d accumulator: y[:] += x[:] */
 template <impl::data_type_t data_type>
 struct cpu_accumulator_1d_t {
-    typedef typename prec_traits<data_type>::type data_t;
+    using data_t = typename prec_traits_t<data_type>::type;
 
     cpu_accumulator_1d_t();
     ~cpu_accumulator_1d_t();
diff --git a/src/cpu/x64/gemm/amx/jit_avx512_core_amx_copy_kern.cpp b/src/cpu/x64/gemm/amx/jit_avx512_core_amx_copy_kern.cpp
index 7f9b09824d2..81771dab6a8 100644
--- a/src/cpu/x64/gemm/amx/jit_avx512_core_amx_copy_kern.cpp
+++ b/src/cpu/x64/gemm/amx/jit_avx512_core_amx_copy_kern.cpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2020-2024 Intel Corporation
+* Copyright 2020-2025 Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -42,7 +42,7 @@ static inline Zmm make_zmm(const Xmm &v) {
     return Zmm(v.getIdx());
 }
 
-void jit_avx512_core_amx_copy_kern::transpose(int s, const Ymm &dst1,
+void jit_avx512_core_amx_copy_kern_t::transpose(int s, const Ymm &dst1,
         const Ymm &dst2, const Ymm &src1, const Ymm &src2) {
     switch (s) {
         case 32:
@@ -91,8 +91,9 @@ void jit_avx512_core_amx_copy_kern::transpose(int s, const Ymm &dst1,
     }
 }
 
-void jit_avx512_core_amx_copy_kern::amxtrans8(const Ymm &dst1, const Ymm &dst2,
-        const Ymm &src1, const Ymm &src2, const Ymm &src3, const Ymm &src4) {
+void jit_avx512_core_amx_copy_kern_t::amxtrans8(const Ymm &dst1,
+        const Ymm &dst2, const Ymm &src1, const Ymm &src2, const Ymm &src3,
+        const Ymm &src4) {
     vpunpcklbw(dst1, src1, src2);
     vpunpckhbw(dst2, src1, src2);
     vpunpcklbw(src1, src3, src4);
@@ -107,7 +108,7 @@ void jit_avx512_core_amx_copy_kern::amxtrans8(const Ymm &dst1, const Ymm &dst2,
     vshufi32x4(src4, dst1, dst2, 0x03);
 }
 
-void jit_avx512_core_amx_copy_kern::amxtrans16(
+void jit_avx512_core_amx_copy_kern_t::amxtrans16(
         const Ymm &dst1, const Ymm &dst2, const Ymm &src1, const Ymm &src2) {
     vpunpcklwd(dst1, src1, src2);
     vpunpckhwd(dst2, src1, src2);
@@ -117,7 +118,7 @@ void jit_avx512_core_amx_copy_kern::amxtrans16(
     vshufi32x4(src2, src2, src2, 0xd8);
 }
 
-void jit_avx512_core_amx_copy_kern::load(
+void jit_avx512_core_amx_copy_kern_t::load(
         const Xmm &dst, const Address &src, bool corner) {
     if (!corner && isize_ == 1)
         vmovdqu8(dst, src);
@@ -129,14 +130,15 @@ void jit_avx512_core_amx_copy_kern::load(
         vmovdqu16(dst | k1 | T_z, src);
 }
 
-void jit_avx512_core_amx_copy_kern::store(const Address &dst, const Xmm &src) {
+void jit_avx512_core_amx_copy_kern_t::store(
+        const Address &dst, const Xmm &src) {
     if (size_ == 1)
         vmovdqu8(dst, src);
     else
         vmovdqu16(dst, src);
 }
 
-void jit_avx512_core_amx_copy_kern::kernel_AN(
+void jit_avx512_core_amx_copy_kern_t::kernel_AN(
         int unroll_x, int unroll_y, int step, Reg64 A, Reg64 B, bool corner) {
     // Transpose data.
     int u[] = {32, 16, 8, 4};
@@ -170,7 +172,7 @@ void jit_avx512_core_amx_copy_kern::kernel_AN(
     }
 }
 
-void jit_avx512_core_amx_copy_kern::kernel_BN(
+void jit_avx512_core_amx_copy_kern_t::kernel_BN(
         int unroll_x, int unroll_y, int step, Reg64 A, Reg64 B, bool corner) {
     // Store data.
for (int i = 0; i < 16; i++) @@ -179,7 +181,7 @@ void jit_avx512_core_amx_copy_kern::kernel_BN( src_[i]); } -void jit_avx512_core_amx_copy_kern::kernel_AT( +void jit_avx512_core_amx_copy_kern_t::kernel_AT( int unroll_x, int unroll_y, int step, Reg64 A, Reg64 B, bool corner) { Ymm v[16]; @@ -258,7 +260,7 @@ void jit_avx512_core_amx_copy_kern::kernel_AT( } } -void jit_avx512_core_amx_copy_kern::kernel_BT( +void jit_avx512_core_amx_copy_kern_t::kernel_BT( int unroll_x, int unroll_y, int step, Reg64 A, Reg64 B, bool corner) { // Transpose data. int u[] = {16, 8, 4, 2, 1}; @@ -297,7 +299,7 @@ void jit_avx512_core_amx_copy_kern::kernel_BT( L(store_end); } -void jit_avx512_core_amx_copy_kern::kernel( +void jit_avx512_core_amx_copy_kern_t::kernel( int unroll_x, int unroll_y, int step, Reg64 A, Reg64 B, bool corner) { // Load matrix. @@ -326,7 +328,7 @@ void jit_avx512_core_amx_copy_kern::kernel( kernel_BT(unroll_x, unroll_y, step, A, B, corner); } -void jit_avx512_core_amx_copy_kern::copy_m(int unroll_m, int unroll_n) { +void jit_avx512_core_amx_copy_kern_t::copy_m(int unroll_m, int unroll_n) { if (is_trans_) { mov(B1_, B_); add(B_, unroll_m * unroll_n * size_); @@ -378,7 +380,7 @@ void jit_avx512_core_amx_copy_kern::copy_m(int unroll_m, int unroll_n) { L_aligned(kernel_tail_end); } -void jit_avx512_core_amx_copy_kern::copy_ns(int unroll_n, Label &epilogue) { +void jit_avx512_core_amx_copy_kern_t::copy_ns(int unroll_n, Label &epilogue) { if (unroll_n > 0) { copy_ns(unroll_n - 1, epilogue); @@ -393,7 +395,7 @@ void jit_avx512_core_amx_copy_kern::copy_ns(int unroll_n, Label &epilogue) { } } -void jit_avx512_core_amx_copy_kern::copy_n(int unroll_n, Label &epilogue) { +void jit_avx512_core_amx_copy_kern_t::copy_n(int unroll_n, Label &epilogue) { Label copy_m_loop, copy_m_end; @@ -422,7 +424,7 @@ void jit_avx512_core_amx_copy_kern::copy_n(int unroll_n, Label &epilogue) { copy_ns(unroll_n - 1, epilogue); } -void jit_avx512_core_amx_copy_kern::generate() { +void jit_avx512_core_amx_copy_kern_t::generate() { // Prologue preamble(); sub(rsp, stack_alloc_size_); @@ -494,9 +496,9 @@ void jit_avx512_core_amx_copy_kern::generate() { postamble(); } -jit_avx512_core_amx_copy_kern::jit_avx512_core_amx_copy_kern( +jit_avx512_core_amx_copy_kern_t::jit_avx512_core_amx_copy_kern_t( bool is_a, bool is_trans, int isize) - : jit_generator(jit_name()) + : jit_generator_t(jit_name()) , is_a_(is_a) , is_trans_(is_trans) , size_(isize) diff --git a/src/cpu/x64/gemm/amx/jit_avx512_core_amx_copy_kern.hpp b/src/cpu/x64/gemm/amx/jit_avx512_core_amx_copy_kern.hpp index 76d830f9750..db74267baef 100644 --- a/src/cpu/x64/gemm/amx/jit_avx512_core_amx_copy_kern.hpp +++ b/src/cpu/x64/gemm/amx/jit_avx512_core_amx_copy_kern.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2020-2021 Intel Corporation +* Copyright 2020-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -24,10 +24,10 @@ namespace impl { namespace cpu { namespace x64 { -class jit_avx512_core_amx_copy_kern : public jit_generator { +class jit_avx512_core_amx_copy_kern_t : public jit_generator_t { public: - jit_avx512_core_amx_copy_kern(bool is_a, bool is_trans, int isize); - DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx512_core_amx_copy_kern); + jit_avx512_core_amx_copy_kern_t(bool is_a, bool is_trans, int isize); + DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx512_core_amx_copy_kern_t); protected: bool is_a_; @@ -66,7 +66,7 @@ class jit_avx512_core_amx_copy_kern : public jit_generator { void copy_n(int unroll_n, Xbyak::Label &epilogue); void copy_ns(int unroll_n, Xbyak::Label &epilogue); - void generate() override ATTRIBUTE_OPTIMIZE; + void generate() override; private: static const int offset_a_ = 0, offset_b_ = 0; diff --git a/src/cpu/x64/gemm/amx/jit_avx512_core_amx_gemm_kern.cpp b/src/cpu/x64/gemm/amx/jit_avx512_core_amx_gemm_kern.cpp index f9005d6ea6e..c92560cb70c 100644 --- a/src/cpu/x64/gemm/amx/jit_avx512_core_amx_gemm_kern.cpp +++ b/src/cpu/x64/gemm/amx/jit_avx512_core_amx_gemm_kern.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2020-2024 Intel Corporation +* Copyright 2020-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -59,7 +59,7 @@ namespace x64 { #define TILED(X) dword[rsp + ((X) + 0xc0)] #define TILEQ(X) qword[rsp + ((X) + 0xc0)] -void jit_avx512_core_amx_gemm_kern::generate() { +void jit_avx512_core_amx_gemm_kern_t::generate() { int kerneltype = ((typea << 1) | typeb); @@ -455,9 +455,9 @@ void jit_avx512_core_amx_gemm_kern::generate() { ret(); } -jit_avx512_core_amx_gemm_kern::jit_avx512_core_amx_gemm_kern( +jit_avx512_core_amx_gemm_kern_t::jit_avx512_core_amx_gemm_kern_t( int typea, int typeb, int typec, int betaZero) - : jit_generator(jit_name(), avx512_core_amx) + : jit_generator_t(jit_name(), avx512_core_amx) , typea(typea) , typeb(typeb) , typec(typec) diff --git a/src/cpu/x64/gemm/amx/jit_avx512_core_amx_gemm_kern.hpp b/src/cpu/x64/gemm/amx/jit_avx512_core_amx_gemm_kern.hpp index 08987d8afc1..fab208e61cf 100644 --- a/src/cpu/x64/gemm/amx/jit_avx512_core_amx_gemm_kern.hpp +++ b/src/cpu/x64/gemm/amx/jit_avx512_core_amx_gemm_kern.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2020-2021 Intel Corporation +* Copyright 2020-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -24,14 +24,14 @@ namespace impl { namespace cpu { namespace x64 { -class jit_avx512_core_amx_gemm_kern : public jit_generator { +class jit_avx512_core_amx_gemm_kern_t : public jit_generator_t { public: - jit_avx512_core_amx_gemm_kern( + jit_avx512_core_amx_gemm_kern_t( int typea, int typeb, int typec, int betaZero); - DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx512_core_amx_gemm_kern); + DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx512_core_amx_gemm_kern_t); protected: - void generate() override ATTRIBUTE_OPTIMIZE; + void generate() override; const int typea; const int typeb; const int typec; diff --git a/src/cpu/x64/gemm/bf16/common_s16.hpp b/src/cpu/x64/gemm/bf16/common_s16.hpp index 28eed475e01..c61e44190eb 100644 --- a/src/cpu/x64/gemm/bf16/common_s16.hpp +++ b/src/cpu/x64/gemm/bf16/common_s16.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2024 Intel Corporation +* Copyright 2019-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -24,68 +24,68 @@ namespace impl { namespace cpu { namespace x64 { -class jit_avx512_core_s16_48x8_copy_an_kern : public jit_generator { - DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx512_core_s16_48x8_copy_an_kern); - void generate() override ATTRIBUTE_OPTIMIZE; +class jit_avx512_core_s16_48x8_copy_an_kern_t : public jit_generator_t { + DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx512_core_s16_48x8_copy_an_kern_t); + void generate() override; public: - jit_avx512_core_s16_48x8_copy_an_kern(); + jit_avx512_core_s16_48x8_copy_an_kern_t(); }; -class jit_avx512_core_s16_48x8_copy_at_kern : public jit_generator { - DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx512_core_s16_48x8_copy_at_kern); - void generate() override ATTRIBUTE_OPTIMIZE; +class jit_avx512_core_s16_48x8_copy_at_kern_t : public jit_generator_t { + DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx512_core_s16_48x8_copy_at_kern_t); + void generate() override; public: - jit_avx512_core_s16_48x8_copy_at_kern(); + jit_avx512_core_s16_48x8_copy_at_kern_t(); }; -class jit_avx512_core_s16_48x8_copy_bn_kern : public jit_generator { - DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx512_core_s16_48x8_copy_bn_kern); - void generate() override ATTRIBUTE_OPTIMIZE; +class jit_avx512_core_s16_48x8_copy_bn_kern_t : public jit_generator_t { + DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx512_core_s16_48x8_copy_bn_kern_t); + void generate() override; public: - jit_avx512_core_s16_48x8_copy_bn_kern(); + jit_avx512_core_s16_48x8_copy_bn_kern_t(); }; -class jit_avx512_core_s16_48x8_copy_bt_kern : public jit_generator { - DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx512_core_s16_48x8_copy_bt_kern); - void generate() override ATTRIBUTE_OPTIMIZE; +class jit_avx512_core_s16_48x8_copy_bt_kern_t : public jit_generator_t { + DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx512_core_s16_48x8_copy_bt_kern_t); + void generate() override; public: - jit_avx512_core_s16_48x8_copy_bt_kern(); + jit_avx512_core_s16_48x8_copy_bt_kern_t(); }; -class jit_avx512_core_s16_24x8_copy_an_kern : public jit_generator { - DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx512_core_s16_24x8_copy_an_kern); - void generate() override ATTRIBUTE_OPTIMIZE; +class jit_avx512_core_s16_24x8_copy_an_kern_t : public jit_generator_t { + DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx512_core_s16_24x8_copy_an_kern_t); + void generate() override; public: - jit_avx512_core_s16_24x8_copy_an_kern(); + jit_avx512_core_s16_24x8_copy_an_kern_t(); }; -class jit_avx512_core_s16_24x8_copy_at_kern : 
public jit_generator { - DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx512_core_s16_24x8_copy_at_kern); - void generate() override ATTRIBUTE_OPTIMIZE; +class jit_avx512_core_s16_24x8_copy_at_kern_t : public jit_generator_t { + DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx512_core_s16_24x8_copy_at_kern_t); + void generate() override; public: - jit_avx512_core_s16_24x8_copy_at_kern(); + jit_avx512_core_s16_24x8_copy_at_kern_t(); }; -class jit_avx512_core_s16_24x8_copy_bn_kern : public jit_generator { - DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx512_core_s16_24x8_copy_bn_kern); - void generate() override ATTRIBUTE_OPTIMIZE; +class jit_avx512_core_s16_24x8_copy_bn_kern_t : public jit_generator_t { + DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx512_core_s16_24x8_copy_bn_kern_t); + void generate() override; public: - jit_avx512_core_s16_24x8_copy_bn_kern(); + jit_avx512_core_s16_24x8_copy_bn_kern_t(); }; -class jit_avx512_core_s16_24x8_copy_bt_kern : public jit_generator { - DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx512_core_s16_24x8_copy_bt_kern); - void generate() override ATTRIBUTE_OPTIMIZE; +class jit_avx512_core_s16_24x8_copy_bt_kern_t : public jit_generator_t { + DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx512_core_s16_24x8_copy_bt_kern_t); + void generate() override; public: - jit_avx512_core_s16_24x8_copy_bt_kern(); + jit_avx512_core_s16_24x8_copy_bt_kern_t(); }; } // namespace x64 diff --git a/src/cpu/x64/gemm/bf16/jit_avx512_core_gemm_bf16bf16f32_kern.cpp b/src/cpu/x64/gemm/bf16/jit_avx512_core_gemm_bf16bf16f32_kern.cpp index 124f6c441b2..17f1a27c19d 100644 --- a/src/cpu/x64/gemm/bf16/jit_avx512_core_gemm_bf16bf16f32_kern.cpp +++ b/src/cpu/x64/gemm/bf16/jit_avx512_core_gemm_bf16bf16f32_kern.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2024 Intel Corporation +* Copyright 2019-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -46,7 +46,7 @@ static inline Zmm make_zmm(const Xmm &v) { } // Load from or store to C. -void jit_avx512_core_gemm_bf16bf16f32_kern::c_load( +void jit_avx512_core_gemm_bf16bf16f32_kern_t::c_load( const Xbyak::Xmm &dst, const Xbyak::Address &src, int nelems) { switch (nelems) { case 1: vmovss(make_xmm(dst), src); break; @@ -60,7 +60,7 @@ void jit_avx512_core_gemm_bf16bf16f32_kern::c_load( } } -void jit_avx512_core_gemm_bf16bf16f32_kern::c_store( +void jit_avx512_core_gemm_bf16bf16f32_kern_t::c_store( const Xbyak::Address &dst, const Xbyak::Xmm &src, int nelems) { switch (nelems) { case 1: vmovss(dst, make_xmm(src)); break; @@ -76,7 +76,7 @@ void jit_avx512_core_gemm_bf16bf16f32_kern::c_store( // Perform length-2 dot product accumulations of bfloat16 in parallel. // Use vdpbf16ps if available, otherwise emulate. -void jit_avx512_core_gemm_bf16bf16f32_kern::dot_product( +void jit_avx512_core_gemm_bf16bf16f32_kern_t::dot_product( const Xmm &dst, const Xmm &src1, const Xmm &src2) { if (bfloat16_) vdpbf16ps(dst, src1, src2); @@ -85,7 +85,7 @@ void jit_avx512_core_gemm_bf16bf16f32_kern::dot_product( } // Inner kernel. -void jit_avx512_core_gemm_bf16bf16f32_kern::kernel_loop( +void jit_avx512_core_gemm_bf16bf16f32_kern_t::kernel_loop( int unroll_m, int unroll_n, bool cfetch) { int um_vecs = utils::div_up(unroll_m, c_nelems_); Label label_kernel_loop; @@ -147,7 +147,7 @@ void jit_avx512_core_gemm_bf16bf16f32_kern::kernel_loop( } // k remainder loop for kernel. 
-void jit_avx512_core_gemm_bf16bf16f32_kern::remainder_kernel(
+void jit_avx512_core_gemm_bf16bf16f32_kern_t::remainder_kernel(
         int unroll_m, int unroll_n, int unroll_k, int bwidth) {
     int um_vecs = utils::div_up(unroll_m, c_nelems_);
@@ -181,7 +181,7 @@ void jit_avx512_core_gemm_bf16bf16f32_kern::remainder_kernel(
 }
 
 // Inner loop.
-void jit_avx512_core_gemm_bf16bf16f32_kern::innerloop(
+void jit_avx512_core_gemm_bf16bf16f32_kern_t::innerloop(
         int unroll_m, int unroll_n) {
     int um_vecs = utils::div_up(unroll_m, c_nelems_);
     int stage1 = unroll_n, stage2 = unroll_n;
@@ -311,7 +311,7 @@ void jit_avx512_core_gemm_bf16bf16f32_kern::innerloop(
 }
 
 // Outer loop.
-void jit_avx512_core_gemm_bf16bf16f32_kern::outerloop(
+void jit_avx512_core_gemm_bf16bf16f32_kern_t::outerloop(
         int unroll_x, int unroll_y, Label *&cur_outerloop_label) {
 
     Label label_m_loop, label_n_loop, label_n_remainder_loops[6];
@@ -375,7 +375,7 @@ void jit_avx512_core_gemm_bf16bf16f32_kern::outerloop(
     align(16);
 }
 
-void jit_avx512_core_gemm_bf16bf16f32_kern::generate() {
+void jit_avx512_core_gemm_bf16bf16f32_kern_t::generate() {
     // Prologue
     preamble();
     sub(rsp, stack_alloc_size_);
@@ -423,9 +423,10 @@ void jit_avx512_core_gemm_bf16bf16f32_kern::generate() {
     postamble();
 }
 
-jit_avx512_core_gemm_bf16bf16f32_kern::jit_avx512_core_gemm_bf16bf16f32_kern(
-        bool beta_zero, bool alpha_one, bool use_zmm)
-    : jit_generator(jit_name())
+jit_avx512_core_gemm_bf16bf16f32_kern_t::
+        jit_avx512_core_gemm_bf16bf16f32_kern_t(
+                bool beta_zero, bool alpha_one, bool use_zmm)
+    : jit_generator_t(jit_name())
     , beta_zero_(beta_zero)
     , alpha_one_(alpha_one)
     , bfloat16_(mayiuse(avx512_core_bf16))
@@ -502,17 +503,14 @@ jit_avx512_core_gemm_bf16bf16f32_kern::jit_avx512_core_gemm_bf16bf16f32_kern(
     zmm_tmp0_ = zmm6;
     zmm_tmp1_ = zmm3;
 
-    bf16_emu_ = nullptr;
     if (!bfloat16_ && use_zmm)
-        bf16_emu_ = new bf16_emulation_t(
+        bf16_emu_ = utils::make_unique<bf16_emulation_t>(
                 this, one_, even_, selector_, scratch_, zmm_tmp0_, zmm_tmp1_);
 }
 
-jit_avx512_core_gemm_bf16bf16f32_kern::
-        ~jit_avx512_core_gemm_bf16bf16f32_kern() {
-    delete bf16_emu_;
-}
-
+jit_avx512_core_gemm_bf16bf16f32_kern_t::
+        ~jit_avx512_core_gemm_bf16bf16f32_kern_t()
+        = default;
 } // namespace x64
 } // namespace cpu
 } // namespace impl
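The dot_product() helpers in these bf16 GEMM/GEMV kernels key off bfloat16_: with avx512_core_bf16 they emit a single vdpbf16ps, otherwise bf16_emu_ synthesizes the same math. Per lane pair the operation is acc += a0*b0 + a1*b1 with each bf16 input widened to f32. A scalar reference model, for illustration only (the emulation path does this with integer shifts and FMAs, not with this helper):

```cpp
#include <cstdint>
#include <cstring>

// bf16 is the upper 16 bits of an IEEE-754 f32.
static float bf16_to_f32(uint16_t b) {
    uint32_t u = uint32_t(b) << 16;
    float f;
    std::memcpy(&f, &u, sizeof(f));
    return f;
}

// One lane of vdpbf16ps: accumulate a length-2 bf16 dot product into f32.
float dpbf16ps_lane(float acc, const uint16_t a[2], const uint16_t b[2]) {
    return acc + bf16_to_f32(a[0]) * bf16_to_f32(b[0])
            + bf16_to_f32(a[1]) * bf16_to_f32(b[1]);
}
```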
diff --git a/src/cpu/x64/gemm/bf16/jit_avx512_core_gemm_bf16bf16f32_kern.hpp b/src/cpu/x64/gemm/bf16/jit_avx512_core_gemm_bf16bf16f32_kern.hpp
index 5362409a44d..bc176fa9467 100644
--- a/src/cpu/x64/gemm/bf16/jit_avx512_core_gemm_bf16bf16f32_kern.hpp
+++ b/src/cpu/x64/gemm/bf16/jit_avx512_core_gemm_bf16bf16f32_kern.hpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2019-2021 Intel Corporation
+* Copyright 2019-2025 Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -25,12 +25,12 @@ namespace impl {
 namespace cpu {
 namespace x64 {
 
-class jit_avx512_core_gemm_bf16bf16f32_kern : public jit_generator {
+class jit_avx512_core_gemm_bf16bf16f32_kern_t : public jit_generator_t {
 public:
-    jit_avx512_core_gemm_bf16bf16f32_kern(
+    jit_avx512_core_gemm_bf16bf16f32_kern_t(
             bool beta_zero, bool alpha_one, bool use_zmm);
-    ~jit_avx512_core_gemm_bf16bf16f32_kern();
-    DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx512_core_gemm_bf16bf16f32_kern);
+    ~jit_avx512_core_gemm_bf16bf16f32_kern_t() override;
+    DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx512_core_gemm_bf16bf16f32_kern_t);
 
 protected:
     bool beta_zero_;
@@ -58,7 +58,7 @@ class jit_avx512_core_gemm_bf16bf16f32_kern : public jit_generator {
     void innerloop(int unroll_m, int unroll_n);
     void outerloop(int unroll_x, int unroll_y, Xbyak::Label *&outerloop_label);
 
-    void generate() override ATTRIBUTE_OPTIMIZE;
+    void generate() override;
 
 private:
     static const int UNROLL_N_ = 8;
@@ -90,13 +90,15 @@ class jit_avx512_core_gemm_bf16bf16f32_kern : public jit_generator {
             arg_coffset_r_;
 
     // For bfloat16 emulation on avx512 and avx512_vnni ISAs
-    bf16_emulation_t *bf16_emu_;
+    std::unique_ptr<bf16_emulation_t> bf16_emu_;
     Xbyak::Reg64 scratch_;
     Xbyak::Zmm one_;
     Xbyak::Zmm even_;
     Xbyak::Zmm selector_;
     Xbyak::Zmm zmm_tmp0_;
     Xbyak::Zmm zmm_tmp1_;
+
+    DNNL_DISALLOW_COPY_AND_ASSIGN(jit_avx512_core_gemm_bf16bf16f32_kern_t);
 };
 
 } // namespace x64
diff --git a/src/cpu/x64/gemm/bf16/jit_avx512_core_gemv_bf16bf16f32_kern.cpp b/src/cpu/x64/gemm/bf16/jit_avx512_core_gemv_bf16bf16f32_kern.cpp
index 4d77805f9ff..42b0430e9a1 100644
--- a/src/cpu/x64/gemm/bf16/jit_avx512_core_gemv_bf16bf16f32_kern.cpp
+++ b/src/cpu/x64/gemm/bf16/jit_avx512_core_gemv_bf16bf16f32_kern.cpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2020-2024 Intel Corporation
+* Copyright 2020-2025 Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -49,7 +49,7 @@ static inline Zmm make_zmm(const Xmm &v) {
 
 // Perform length-2 dot product accumulations of bfloat16 in parallel.
 // Use vdpbf16ps if available, otherwise emulate.
-void jit_avx512_core_gemv_bf16bf16f32_kern::dot_product(
+void jit_avx512_core_gemv_bf16bf16f32_kern_t::dot_product(
         const Xmm &dst, const Xmm &src1, const Xmm &src2) {
     if (bfloat16_)
         vdpbf16ps(dst, src1, src2);
@@ -58,7 +58,7 @@ void jit_avx512_core_gemv_bf16bf16f32_kern::dot_product(
 }
 
 // Vector load for 16-bit values.
-void jit_avx512_core_gemv_bf16bf16f32_kern::v_load( +void jit_avx512_core_gemv_bf16bf16f32_kern_t::v_load( const Xbyak::Xmm &dst, const Xbyak::Address &src, int nelems) { if (nelems >= 32) vmovdqu16(dst, src); @@ -82,7 +82,7 @@ void jit_avx512_core_gemv_bf16bf16f32_kern::v_load( vmovdqu16(make_xmm(dst) | k1 | T_z, src); } -void jit_avx512_core_gemv_bf16bf16f32_kern::y_load( +void jit_avx512_core_gemv_bf16bf16f32_kern_t::y_load( const Xbyak::Xmm &dst, const Xbyak::Address &src, int nelems) { if (nelems >= 16) vmovups(dst, src); @@ -102,7 +102,7 @@ void jit_avx512_core_gemv_bf16bf16f32_kern::y_load( vmovss(make_xmm(dst), src); } -void jit_avx512_core_gemv_bf16bf16f32_kern::y_store( +void jit_avx512_core_gemv_bf16bf16f32_kern_t::y_store( const Xbyak::Address &dst, const Xbyak::Xmm &src, int nelems) { if (nelems >= 16) vmovups(dst, src); @@ -122,7 +122,7 @@ void jit_avx512_core_gemv_bf16bf16f32_kern::y_store( vmovss(dst, make_xmm(src)); } -void jit_avx512_core_gemv_bf16bf16f32_kern::kernel_loop_n( +void jit_avx512_core_gemv_bf16bf16f32_kern_t::kernel_loop_n( int unroll_m, int unroll_n, bool fetch, bool last) { int zmm_vecs = utils::div_up(unroll_m, 32); @@ -203,7 +203,7 @@ void jit_avx512_core_gemv_bf16bf16f32_kern::kernel_loop_n( } // Inner loop for A non-transposed. -void jit_avx512_core_gemv_bf16bf16f32_kern::innerloop_n(int unroll_n) { +void jit_avx512_core_gemv_bf16bf16f32_kern_t::innerloop_n(int unroll_n) { mov(A1_, A_); if (unroll_n > 4) { lea(A2_, ptr[A1_ + LDA_ * 4]); @@ -283,7 +283,7 @@ void jit_avx512_core_gemv_bf16bf16f32_kern::innerloop_n(int unroll_n) { L_aligned(label_m_tail_end); } -void jit_avx512_core_gemv_bf16bf16f32_kern::kernel_loop_t( +void jit_avx512_core_gemv_bf16bf16f32_kern_t::kernel_loop_t( int unroll_m, int unroll_n, bool fetch, bool last) { // Load x. @@ -312,7 +312,7 @@ void jit_avx512_core_gemv_bf16bf16f32_kern::kernel_loop_t( } // Inner loop for A transposed. -void jit_avx512_core_gemv_bf16bf16f32_kern::innerloop_t(int unroll_n) { +void jit_avx512_core_gemv_bf16bf16f32_kern_t::innerloop_t(int unroll_n) { mov(A1_, A_); if (unroll_n > 4) { lea(A2_, ptr[A1_ + LDA_ * 4]); @@ -431,7 +431,7 @@ void jit_avx512_core_gemv_bf16bf16f32_kern::innerloop_t(int unroll_n) { } // Outer loop. 
-void jit_avx512_core_gemv_bf16bf16f32_kern::outerloop(int unroll_y, +void jit_avx512_core_gemv_bf16bf16f32_kern_t::outerloop(int unroll_y, Label *&cur_outerloop_label, Label *&outerloop_end_label) { bool is_tail = unroll_y < UNROLL_N_; @@ -464,7 +464,7 @@ void jit_avx512_core_gemv_bf16bf16f32_kern::outerloop(int unroll_y, } } -void jit_avx512_core_gemv_bf16bf16f32_kern::generate() { +void jit_avx512_core_gemv_bf16bf16f32_kern_t::generate() { // Prologue preamble(); @@ -513,9 +513,9 @@ void jit_avx512_core_gemv_bf16bf16f32_kern::generate() { } // Function signature: gemv(*m, *n, *alpha, *a, *lda, *x, *incx, *y, *incy) -jit_avx512_core_gemv_bf16bf16f32_kern::jit_avx512_core_gemv_bf16bf16f32_kern( - bool trans) - : jit_generator(jit_name()) +jit_avx512_core_gemv_bf16bf16f32_kern_t:: + jit_avx512_core_gemv_bf16bf16f32_kern_t(bool trans) + : jit_generator_t(jit_name()) , trans_(trans) , bfloat16_(mayiuse(avx512_core_bf16)) , arg_lda_(0) @@ -605,8 +605,8 @@ jit_avx512_core_gemv_bf16bf16f32_kern::jit_avx512_core_gemv_bf16bf16f32_kern( this, one_, even_, selector_, gpr_, zmm_tmp0_, zmm_tmp1_); } -jit_avx512_core_gemv_bf16bf16f32_kern:: - ~jit_avx512_core_gemv_bf16bf16f32_kern() { +jit_avx512_core_gemv_bf16bf16f32_kern_t:: + ~jit_avx512_core_gemv_bf16bf16f32_kern_t() { delete bf16_emu_; } diff --git a/src/cpu/x64/gemm/bf16/jit_avx512_core_gemv_bf16bf16f32_kern.hpp b/src/cpu/x64/gemm/bf16/jit_avx512_core_gemv_bf16bf16f32_kern.hpp index c7418ce7642..c108d6afc83 100644 --- a/src/cpu/x64/gemm/bf16/jit_avx512_core_gemv_bf16bf16f32_kern.hpp +++ b/src/cpu/x64/gemm/bf16/jit_avx512_core_gemv_bf16bf16f32_kern.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2020-2021 Intel Corporation +* Copyright 2020-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -25,11 +25,11 @@ namespace impl { namespace cpu { namespace x64 { -class jit_avx512_core_gemv_bf16bf16f32_kern : public jit_generator { +class jit_avx512_core_gemv_bf16bf16f32_kern_t : public jit_generator_t { public: - jit_avx512_core_gemv_bf16bf16f32_kern(bool trans); - ~jit_avx512_core_gemv_bf16bf16f32_kern(); - DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx512_core_gemv_bf16bf16f32_kern); + jit_avx512_core_gemv_bf16bf16f32_kern_t(bool trans); + ~jit_avx512_core_gemv_bf16bf16f32_kern_t() override; + DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx512_core_gemv_bf16bf16f32_kern_t); protected: bool trans_; @@ -52,7 +52,7 @@ class jit_avx512_core_gemv_bf16bf16f32_kern : public jit_generator { void outerloop(int unroll_y, Xbyak::Label *&cur_outerloop_label, Xbyak::Label *&outerloop_end_label); - void generate() override ATTRIBUTE_OPTIMIZE; + void generate() override; private: static const int UNROLL_M_ = 64; diff --git a/src/cpu/x64/gemm/bf16/jit_avx512_core_s16_24x8_copy_an_kern_autogen.cpp b/src/cpu/x64/gemm/bf16/jit_avx512_core_s16_24x8_copy_an_kern_autogen.cpp index 22f089dc8b0..491a2a51c52 100644 --- a/src/cpu/x64/gemm/bf16/jit_avx512_core_s16_24x8_copy_an_kern_autogen.cpp +++ b/src/cpu/x64/gemm/bf16/jit_avx512_core_s16_24x8_copy_an_kern_autogen.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2020-2024 Intel Corporation +* Copyright 2020-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -23,10 +23,11 @@ namespace impl { namespace cpu { namespace x64 { -jit_avx512_core_s16_24x8_copy_an_kern::jit_avx512_core_s16_24x8_copy_an_kern() - : jit_generator(jit_name()) {} +jit_avx512_core_s16_24x8_copy_an_kern_t:: + jit_avx512_core_s16_24x8_copy_an_kern_t() + : jit_generator_t(jit_name()) {} -void jit_avx512_core_s16_24x8_copy_an_kern::generate() { +void jit_avx512_core_s16_24x8_copy_an_kern_t::generate() { #ifndef _WIN32 #define M rdi diff --git a/src/cpu/x64/gemm/bf16/jit_avx512_core_s16_24x8_copy_at_kern_autogen.cpp b/src/cpu/x64/gemm/bf16/jit_avx512_core_s16_24x8_copy_at_kern_autogen.cpp index 9a6032745f7..69f0d00e129 100644 --- a/src/cpu/x64/gemm/bf16/jit_avx512_core_s16_24x8_copy_at_kern_autogen.cpp +++ b/src/cpu/x64/gemm/bf16/jit_avx512_core_s16_24x8_copy_at_kern_autogen.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2020-2024 Intel Corporation +* Copyright 2020-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -23,10 +23,11 @@ namespace impl { namespace cpu { namespace x64 { -jit_avx512_core_s16_24x8_copy_at_kern::jit_avx512_core_s16_24x8_copy_at_kern() - : jit_generator(jit_name()) {} +jit_avx512_core_s16_24x8_copy_at_kern_t:: + jit_avx512_core_s16_24x8_copy_at_kern_t() + : jit_generator_t(jit_name()) {} -void jit_avx512_core_s16_24x8_copy_at_kern::generate() { +void jit_avx512_core_s16_24x8_copy_at_kern_t::generate() { #ifndef _WIN32 #define M rdi diff --git a/src/cpu/x64/gemm/bf16/jit_avx512_core_s16_24x8_copy_bn_kern_autogen.cpp b/src/cpu/x64/gemm/bf16/jit_avx512_core_s16_24x8_copy_bn_kern_autogen.cpp index be61df11e29..01db091bf68 100644 --- a/src/cpu/x64/gemm/bf16/jit_avx512_core_s16_24x8_copy_bn_kern_autogen.cpp +++ b/src/cpu/x64/gemm/bf16/jit_avx512_core_s16_24x8_copy_bn_kern_autogen.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2020-2024 Intel Corporation +* Copyright 2020-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -23,10 +23,11 @@ namespace impl { namespace cpu { namespace x64 { -jit_avx512_core_s16_24x8_copy_bn_kern::jit_avx512_core_s16_24x8_copy_bn_kern() - : jit_generator(jit_name()) {} +jit_avx512_core_s16_24x8_copy_bn_kern_t:: + jit_avx512_core_s16_24x8_copy_bn_kern_t() + : jit_generator_t(jit_name()) {} -void jit_avx512_core_s16_24x8_copy_bn_kern::generate() { +void jit_avx512_core_s16_24x8_copy_bn_kern_t::generate() { #ifndef _WIN32 #define M rdi diff --git a/src/cpu/x64/gemm/bf16/jit_avx512_core_s16_24x8_copy_bt_kern_autogen.cpp b/src/cpu/x64/gemm/bf16/jit_avx512_core_s16_24x8_copy_bt_kern_autogen.cpp index cd62ed88dbd..5164dff7cb8 100644 --- a/src/cpu/x64/gemm/bf16/jit_avx512_core_s16_24x8_copy_bt_kern_autogen.cpp +++ b/src/cpu/x64/gemm/bf16/jit_avx512_core_s16_24x8_copy_bt_kern_autogen.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2020-2024 Intel Corporation +* Copyright 2020-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -23,10 +23,11 @@ namespace impl { namespace cpu { namespace x64 { -jit_avx512_core_s16_24x8_copy_bt_kern::jit_avx512_core_s16_24x8_copy_bt_kern() - : jit_generator(jit_name()) {} +jit_avx512_core_s16_24x8_copy_bt_kern_t:: + jit_avx512_core_s16_24x8_copy_bt_kern_t() + : jit_generator_t(jit_name()) {} -void jit_avx512_core_s16_24x8_copy_bt_kern::generate() { +void jit_avx512_core_s16_24x8_copy_bt_kern_t::generate() { #ifndef _WIN32 #define M rdi diff --git a/src/cpu/x64/gemm/bf16/jit_avx512_core_s16_48x8_copy_an_kern_autogen.cpp b/src/cpu/x64/gemm/bf16/jit_avx512_core_s16_48x8_copy_an_kern_autogen.cpp index 3a936e6a280..c6d3c901c04 100644 --- a/src/cpu/x64/gemm/bf16/jit_avx512_core_s16_48x8_copy_an_kern_autogen.cpp +++ b/src/cpu/x64/gemm/bf16/jit_avx512_core_s16_48x8_copy_an_kern_autogen.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2024 Intel Corporation +* Copyright 2019-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -23,10 +23,11 @@ namespace impl { namespace cpu { namespace x64 { -jit_avx512_core_s16_48x8_copy_an_kern::jit_avx512_core_s16_48x8_copy_an_kern() - : jit_generator(jit_name()) {} +jit_avx512_core_s16_48x8_copy_an_kern_t:: + jit_avx512_core_s16_48x8_copy_an_kern_t() + : jit_generator_t(jit_name()) {} -void jit_avx512_core_s16_48x8_copy_an_kern::generate() { +void jit_avx512_core_s16_48x8_copy_an_kern_t::generate() { #ifndef _WIN32 #define M rdi diff --git a/src/cpu/x64/gemm/bf16/jit_avx512_core_s16_48x8_copy_at_kern_autogen.cpp b/src/cpu/x64/gemm/bf16/jit_avx512_core_s16_48x8_copy_at_kern_autogen.cpp index ced7abdd837..815d72b437b 100644 --- a/src/cpu/x64/gemm/bf16/jit_avx512_core_s16_48x8_copy_at_kern_autogen.cpp +++ b/src/cpu/x64/gemm/bf16/jit_avx512_core_s16_48x8_copy_at_kern_autogen.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2024 Intel Corporation +* Copyright 2019-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -23,10 +23,11 @@ namespace impl { namespace cpu { namespace x64 { -jit_avx512_core_s16_48x8_copy_at_kern::jit_avx512_core_s16_48x8_copy_at_kern() - : jit_generator(jit_name()) {} +jit_avx512_core_s16_48x8_copy_at_kern_t:: + jit_avx512_core_s16_48x8_copy_at_kern_t() + : jit_generator_t(jit_name()) {} -void jit_avx512_core_s16_48x8_copy_at_kern::generate() { +void jit_avx512_core_s16_48x8_copy_at_kern_t::generate() { #ifndef _WIN32 #define M rdi diff --git a/src/cpu/x64/gemm/bf16/jit_avx512_core_s16_48x8_copy_bn_kern_autogen.cpp b/src/cpu/x64/gemm/bf16/jit_avx512_core_s16_48x8_copy_bn_kern_autogen.cpp index 196039ad816..da6d516438d 100644 --- a/src/cpu/x64/gemm/bf16/jit_avx512_core_s16_48x8_copy_bn_kern_autogen.cpp +++ b/src/cpu/x64/gemm/bf16/jit_avx512_core_s16_48x8_copy_bn_kern_autogen.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2024 Intel Corporation +* Copyright 2019-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -23,10 +23,11 @@ namespace impl { namespace cpu { namespace x64 { -jit_avx512_core_s16_48x8_copy_bn_kern::jit_avx512_core_s16_48x8_copy_bn_kern() - : jit_generator(jit_name()) {} +jit_avx512_core_s16_48x8_copy_bn_kern_t:: + jit_avx512_core_s16_48x8_copy_bn_kern_t() + : jit_generator_t(jit_name()) {} -void jit_avx512_core_s16_48x8_copy_bn_kern::generate() { +void jit_avx512_core_s16_48x8_copy_bn_kern_t::generate() { #ifndef _WIN32 #define M rdi diff --git a/src/cpu/x64/gemm/bf16/jit_avx512_core_s16_48x8_copy_bt_kern_autogen.cpp b/src/cpu/x64/gemm/bf16/jit_avx512_core_s16_48x8_copy_bt_kern_autogen.cpp index d448a2e121a..2f5918a5748 100644 --- a/src/cpu/x64/gemm/bf16/jit_avx512_core_s16_48x8_copy_bt_kern_autogen.cpp +++ b/src/cpu/x64/gemm/bf16/jit_avx512_core_s16_48x8_copy_bt_kern_autogen.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2024 Intel Corporation +* Copyright 2019-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -23,10 +23,11 @@ namespace impl { namespace cpu { namespace x64 { -jit_avx512_core_s16_48x8_copy_bt_kern::jit_avx512_core_s16_48x8_copy_bt_kern() - : jit_generator(jit_name()) {} +jit_avx512_core_s16_48x8_copy_bt_kern_t:: + jit_avx512_core_s16_48x8_copy_bt_kern_t() + : jit_generator_t(jit_name()) {} -void jit_avx512_core_s16_48x8_copy_bt_kern::generate() { +void jit_avx512_core_s16_48x8_copy_bt_kern_t::generate() { #ifndef _WIN32 #define M rdi diff --git a/src/cpu/x64/gemm/f32/common_f32.hpp b/src/cpu/x64/gemm/f32/common_f32.hpp index 953aa9481e1..ed632c06c06 100644 --- a/src/cpu/x64/gemm/f32/common_f32.hpp +++ b/src/cpu/x64/gemm/f32/common_f32.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2024 Intel Corporation +* Copyright 2019-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -24,176 +24,173 @@ namespace impl { namespace cpu { namespace x64 { -class jit_avx512_core_f32_copy_an_kern : public jit_generator { - DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx512_core_f32_copy_an_kern); - void generate() override ATTRIBUTE_OPTIMIZE; +class jit_avx512_core_f32_copy_an_kern_t : public jit_generator_t { + DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx512_core_f32_copy_an_kern_t); + void generate() override; public: - jit_avx512_core_f32_copy_an_kern(); + jit_avx512_core_f32_copy_an_kern_t(); }; -class jit_avx512_core_f32_copy_at_kern : public jit_generator { - DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx512_core_f32_copy_at_kern); - void generate() override ATTRIBUTE_OPTIMIZE; +class jit_avx512_core_f32_copy_at_kern_t : public jit_generator_t { + DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx512_core_f32_copy_at_kern_t); + void generate() override; void generate_part1(const Xbyak::Label &, const Xbyak::Label &, - const Xbyak::Label &, const Xbyak::Label &) ATTRIBUTE_OPTIMIZE; - void generate_part2(Xbyak::Label, Xbyak::Label, Xbyak::Label, - Xbyak::Label) ATTRIBUTE_OPTIMIZE; + const Xbyak::Label &, const Xbyak::Label &); + void generate_part2(Xbyak::Label, Xbyak::Label, Xbyak::Label, Xbyak::Label); public: - jit_avx512_core_f32_copy_at_kern(); + jit_avx512_core_f32_copy_at_kern_t(); }; -class jit_avx512_core_f32_copy_bn_kern : public jit_generator { - DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx512_core_f32_copy_bn_kern); - void generate() override ATTRIBUTE_OPTIMIZE; +class jit_avx512_core_f32_copy_bn_kern_t : public jit_generator_t { + DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx512_core_f32_copy_bn_kern_t); + void generate() override; public: - jit_avx512_core_f32_copy_bn_kern(); + jit_avx512_core_f32_copy_bn_kern_t(); }; -class jit_avx512_core_f32_copy_bt_kern : public jit_generator { - DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx512_core_f32_copy_bt_kern); - void generate() override ATTRIBUTE_OPTIMIZE; +class jit_avx512_core_f32_copy_bt_kern_t : public jit_generator_t { + DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx512_core_f32_copy_bt_kern_t); + void generate() override; public: - jit_avx512_core_f32_copy_bt_kern(); + jit_avx512_core_f32_copy_bt_kern_t(); }; -class jit_avx2_f32_copy_an_kern : public jit_generator { - DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx2_f32_copy_an_kern); - void generate() override ATTRIBUTE_OPTIMIZE; +class jit_avx2_f32_copy_an_kern_t : public jit_generator_t { + DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx2_f32_copy_an_kern_t); + void generate() override; public: - jit_avx2_f32_copy_an_kern(); + jit_avx2_f32_copy_an_kern_t(); }; -class jit_avx2_f32_copy_at_kern : public jit_generator { - DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx2_f32_copy_at_kern); - void generate() override ATTRIBUTE_OPTIMIZE; +class jit_avx2_f32_copy_at_kern_t : public jit_generator_t { + DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx2_f32_copy_at_kern_t); + void generate() override; public: - jit_avx2_f32_copy_at_kern(); + jit_avx2_f32_copy_at_kern_t(); }; -class jit_avx2_f32_copy_bn_kern : public jit_generator { - DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx2_f32_copy_bn_kern); - void generate() override ATTRIBUTE_OPTIMIZE; +class jit_avx2_f32_copy_bn_kern_t : public jit_generator_t { + DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx2_f32_copy_bn_kern_t); + void generate() override; public: - jit_avx2_f32_copy_bn_kern(); + jit_avx2_f32_copy_bn_kern_t(); }; -class jit_avx2_f32_copy_bt_kern : public jit_generator { - DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx2_f32_copy_bt_kern); - void generate() override ATTRIBUTE_OPTIMIZE; +class jit_avx2_f32_copy_bt_kern_t : public 
jit_generator_t { + DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx2_f32_copy_bt_kern_t); + void generate() override; public: - jit_avx2_f32_copy_bt_kern(); + jit_avx2_f32_copy_bt_kern_t(); }; -class jit_avx_f32_copy_an_kern : public jit_generator { - DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx_f32_copy_an_kern); - void generate() override ATTRIBUTE_OPTIMIZE; +class jit_avx_f32_copy_an_kern_t : public jit_generator_t { + DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx_f32_copy_an_kern_t); + void generate() override; public: - jit_avx_f32_copy_an_kern(); + jit_avx_f32_copy_an_kern_t(); }; -class jit_avx_f32_copy_at_kern : public jit_generator { - DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx_f32_copy_at_kern); - void generate() override ATTRIBUTE_OPTIMIZE; +class jit_avx_f32_copy_at_kern_t : public jit_generator_t { + DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx_f32_copy_at_kern_t); + void generate() override; public: - jit_avx_f32_copy_at_kern(); + jit_avx_f32_copy_at_kern_t(); }; -class jit_avx_f32_copy_bn_kern : public jit_generator { - DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx_f32_copy_bn_kern); - void generate() override ATTRIBUTE_OPTIMIZE; +class jit_avx_f32_copy_bn_kern_t : public jit_generator_t { + DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx_f32_copy_bn_kern_t); + void generate() override; public: - jit_avx_f32_copy_bn_kern(); + jit_avx_f32_copy_bn_kern_t(); }; -class jit_avx_f32_copy_bt_kern : public jit_generator { - DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx_f32_copy_bt_kern); - void generate() override ATTRIBUTE_OPTIMIZE; +class jit_avx_f32_copy_bt_kern_t : public jit_generator_t { + DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx_f32_copy_bt_kern_t); + void generate() override; public: - jit_avx_f32_copy_bt_kern(); + jit_avx_f32_copy_bt_kern_t(); }; -class jit_avx_kernel_b0_sgemm_kern : public jit_generator { - DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx_kernel_b0_sgemm_kern); - void generate() override ATTRIBUTE_OPTIMIZE; +class jit_avx_kernel_b0_sgemm_kern_t : public jit_generator_t { + DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx_kernel_b0_sgemm_kern_t); + void generate() override; void generate_part1(const Xbyak::Label &, const Xbyak::Label &, - const Xbyak::Label &, const Xbyak::Label &) ATTRIBUTE_OPTIMIZE; - void generate_part2(Xbyak::Label, Xbyak::Label, Xbyak::Label, - Xbyak::Label) ATTRIBUTE_OPTIMIZE; + const Xbyak::Label &, const Xbyak::Label &); + void generate_part2(Xbyak::Label, Xbyak::Label, Xbyak::Label, Xbyak::Label); public: - jit_avx_kernel_b0_sgemm_kern(); + jit_avx_kernel_b0_sgemm_kern_t(); }; -class jit_avx_kernel_sgemm_kern : public jit_generator { - DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx_kernel_sgemm_kern); - void generate() override ATTRIBUTE_OPTIMIZE; - void generate_part1(const Xbyak::Label &, const Xbyak::Label &, - const Xbyak::Label &) ATTRIBUTE_OPTIMIZE; - void generate_part2( - Xbyak::Label &, Xbyak::Label &, Xbyak::Label &) ATTRIBUTE_OPTIMIZE; +class jit_avx_kernel_sgemm_kern_t : public jit_generator_t { + DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx_kernel_sgemm_kern_t); + void generate() override; + void generate_part1( + const Xbyak::Label &, const Xbyak::Label &, const Xbyak::Label &); + void generate_part2(Xbyak::Label &, Xbyak::Label &, Xbyak::Label &); public: - jit_avx_kernel_sgemm_kern(); + jit_avx_kernel_sgemm_kern_t(); }; -class jit_sse41_f32_copy_an_kern : public jit_generator { - DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_sse41_f32_copy_an_kern); - void generate() override ATTRIBUTE_OPTIMIZE; +class jit_sse41_f32_copy_an_kern_t : public jit_generator_t { + DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_sse41_f32_copy_an_kern_t); + void 
generate() override; public: - jit_sse41_f32_copy_an_kern(); + jit_sse41_f32_copy_an_kern_t(); }; -class jit_sse41_f32_copy_at_kern : public jit_generator { - DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_sse41_f32_copy_at_kern); - void generate() override ATTRIBUTE_OPTIMIZE; +class jit_sse41_f32_copy_at_kern_t : public jit_generator_t { + DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_sse41_f32_copy_at_kern_t); + void generate() override; public: - jit_sse41_f32_copy_at_kern(); + jit_sse41_f32_copy_at_kern_t(); }; -class jit_sse41_f32_copy_bn_kern : public jit_generator { - DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_sse41_f32_copy_bn_kern); - void generate() override ATTRIBUTE_OPTIMIZE; +class jit_sse41_f32_copy_bn_kern_t : public jit_generator_t { + DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_sse41_f32_copy_bn_kern_t); + void generate() override; public: - jit_sse41_f32_copy_bn_kern(); + jit_sse41_f32_copy_bn_kern_t(); }; -class jit_sse41_f32_copy_bt_kern : public jit_generator { - DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_sse41_f32_copy_bt_kern); - void generate() override ATTRIBUTE_OPTIMIZE; +class jit_sse41_f32_copy_bt_kern_t : public jit_generator_t { + DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_sse41_f32_copy_bt_kern_t); + void generate() override; public: - jit_sse41_f32_copy_bt_kern(); + jit_sse41_f32_copy_bt_kern_t(); }; -class jit_sse41_kernel_b0_sgemm_kern : public jit_generator { - DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_sse41_kernel_b0_sgemm_kern); - void generate() override ATTRIBUTE_OPTIMIZE; +class jit_sse41_kernel_b0_sgemm_kern_t : public jit_generator_t { + DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_sse41_kernel_b0_sgemm_kern_t); + void generate() override; public: - jit_sse41_kernel_b0_sgemm_kern(); + jit_sse41_kernel_b0_sgemm_kern_t(); }; -class jit_sse41_kernel_sgemm_kern : public jit_generator { - DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_sse41_kernel_sgemm_kern); - void generate() override ATTRIBUTE_OPTIMIZE; +class jit_sse41_kernel_sgemm_kern_t : public jit_generator_t { + DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_sse41_kernel_sgemm_kern_t); + void generate() override; public: - jit_sse41_kernel_sgemm_kern(); + jit_sse41_kernel_sgemm_kern_t(); }; } // namespace x64 diff --git a/src/cpu/x64/gemm/f32/jit_avx2_f32_copy_an_kern_autogen.cpp b/src/cpu/x64/gemm/f32/jit_avx2_f32_copy_an_kern_autogen.cpp index 3b14fe68440..ba136908bfa 100644 --- a/src/cpu/x64/gemm/f32/jit_avx2_f32_copy_an_kern_autogen.cpp +++ b/src/cpu/x64/gemm/f32/jit_avx2_f32_copy_an_kern_autogen.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2024 Intel Corporation +* Copyright 2019-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -23,10 +23,10 @@ namespace impl { namespace cpu { namespace x64 { -jit_avx2_f32_copy_an_kern::jit_avx2_f32_copy_an_kern() - : jit_generator(jit_name()) {} +jit_avx2_f32_copy_an_kern_t::jit_avx2_f32_copy_an_kern_t() + : jit_generator_t(jit_name()) {} -void jit_avx2_f32_copy_an_kern::generate() { +void jit_avx2_f32_copy_an_kern_t::generate() { #ifndef _WIN32 #define M rdi diff --git a/src/cpu/x64/gemm/f32/jit_avx2_f32_copy_at_kern_autogen.cpp b/src/cpu/x64/gemm/f32/jit_avx2_f32_copy_at_kern_autogen.cpp index 8f9205dfca5..daa3ece4b9c 100644 --- a/src/cpu/x64/gemm/f32/jit_avx2_f32_copy_at_kern_autogen.cpp +++ b/src/cpu/x64/gemm/f32/jit_avx2_f32_copy_at_kern_autogen.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2024 Intel Corporation +* Copyright 2019-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -23,10 +23,10 @@ namespace impl { namespace cpu { namespace x64 { -jit_avx2_f32_copy_at_kern::jit_avx2_f32_copy_at_kern() - : jit_generator(jit_name()) {} +jit_avx2_f32_copy_at_kern_t::jit_avx2_f32_copy_at_kern_t() + : jit_generator_t(jit_name()) {} -void jit_avx2_f32_copy_at_kern::generate() { +void jit_avx2_f32_copy_at_kern_t::generate() { #ifndef _WIN32 #define M rdi diff --git a/src/cpu/x64/gemm/f32/jit_avx2_f32_copy_bn_kern_autogen.cpp b/src/cpu/x64/gemm/f32/jit_avx2_f32_copy_bn_kern_autogen.cpp index 1b086a5e4de..f3e17a76a87 100644 --- a/src/cpu/x64/gemm/f32/jit_avx2_f32_copy_bn_kern_autogen.cpp +++ b/src/cpu/x64/gemm/f32/jit_avx2_f32_copy_bn_kern_autogen.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2024 Intel Corporation +* Copyright 2019-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -23,10 +23,10 @@ namespace impl { namespace cpu { namespace x64 { -jit_avx2_f32_copy_bn_kern::jit_avx2_f32_copy_bn_kern() - : jit_generator(jit_name()) {} +jit_avx2_f32_copy_bn_kern_t::jit_avx2_f32_copy_bn_kern_t() + : jit_generator_t(jit_name()) {} -void jit_avx2_f32_copy_bn_kern::generate() { +void jit_avx2_f32_copy_bn_kern_t::generate() { #ifndef _WIN32 #define M rdi diff --git a/src/cpu/x64/gemm/f32/jit_avx2_f32_copy_bt_kern_autogen.cpp b/src/cpu/x64/gemm/f32/jit_avx2_f32_copy_bt_kern_autogen.cpp index 9fd7218234b..461d24d51e4 100644 --- a/src/cpu/x64/gemm/f32/jit_avx2_f32_copy_bt_kern_autogen.cpp +++ b/src/cpu/x64/gemm/f32/jit_avx2_f32_copy_bt_kern_autogen.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2024 Intel Corporation +* Copyright 2019-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -23,10 +23,10 @@ namespace impl { namespace cpu { namespace x64 { -jit_avx2_f32_copy_bt_kern::jit_avx2_f32_copy_bt_kern() - : jit_generator(jit_name()) {} +jit_avx2_f32_copy_bt_kern_t::jit_avx2_f32_copy_bt_kern_t() + : jit_generator_t(jit_name()) {} -void jit_avx2_f32_copy_bt_kern::generate() { +void jit_avx2_f32_copy_bt_kern_t::generate() { #ifndef _WIN32 #define M rdi diff --git a/src/cpu/x64/gemm/f32/jit_avx2_kernel_sgemm_kern.cpp b/src/cpu/x64/gemm/f32/jit_avx2_kernel_sgemm_kern.cpp index d0fb52fa6c3..0a8dd0ddbaf 100644 --- a/src/cpu/x64/gemm/f32/jit_avx2_kernel_sgemm_kern.cpp +++ b/src/cpu/x64/gemm/f32/jit_avx2_kernel_sgemm_kern.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2024 Intel Corporation +* Copyright 2019-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -27,14 +27,14 @@ namespace impl { namespace cpu { namespace x64 { -int jit_avx2_kernel_sgemm_kern::next_acc(int idx, int um, int un) const { +int jit_avx2_kernel_sgemm_kern_t::next_acc(int idx, int um, int un) const { while (!(((idx / unroll_n_) < std::max(1, um / nelt_per_vecreg_)) || ((idx % unroll_n_) < un))) idx++; return idx; } -void jit_avx2_kernel_sgemm_kern::prefetchB_beforeBload( +void jit_avx2_kernel_sgemm_kern_t::prefetchB_beforeBload( int um, int un, int k_idx, int n_idx) { if (!(mayiuse(avx512_core) && __BUILD_GEMM_AVX512)) { if ((n_idx == 0) && (k_idx == 0) && (un == unroll_n_) && (um != 16)) { @@ -44,7 +44,7 @@ void jit_avx2_kernel_sgemm_kern::prefetchB_beforeBload( } } -void jit_avx2_kernel_sgemm_kern::prefetchB_beforeFMA( +void jit_avx2_kernel_sgemm_kern_t::prefetchB_beforeFMA( int um, int un, int k_idx, int n_idx, int m_idx) { if (!(mayiuse(avx512_core) && __BUILD_GEMM_AVX512)) { if ((um == 16) || (un < unroll_n_)) { @@ -61,7 +61,7 @@ void jit_avx2_kernel_sgemm_kern::prefetchB_beforeFMA( } } -void jit_avx2_kernel_sgemm_kern::prefetchA_afterFMA( +void jit_avx2_kernel_sgemm_kern_t::prefetchA_afterFMA( int um, int un, int k_idx, int n_idx, int m_idx) { if (mayiuse(avx512_core) && __BUILD_GEMM_AVX512) { if ((um < unroll_m_) && (m_idx == 0)) { @@ -85,7 +85,7 @@ void jit_avx2_kernel_sgemm_kern::prefetchA_afterFMA( } } -void jit_avx2_kernel_sgemm_kern::prefetchA_afterBload( +void jit_avx2_kernel_sgemm_kern_t::prefetchA_afterBload( int um, int un, int k_idx, int n_idx) { if (!(mayiuse(avx512_core) && __BUILD_GEMM_AVX512)) { if ((um == unroll_m_) && (un == 2)) { @@ -109,7 +109,7 @@ void jit_avx2_kernel_sgemm_kern::prefetchA_afterBload( } } -void jit_avx2_kernel_sgemm_kern::prefetchB_afterFMA( +void jit_avx2_kernel_sgemm_kern_t::prefetchB_afterFMA( int k_idx, int n_idx, int m_idx) { if (mayiuse(avx512_core) && __BUILD_GEMM_AVX512) { if (((m_idx + (k_idx % (nb_zmm_a_ / unroll_m_reg_)) * unroll_m_reg_) @@ -124,7 +124,7 @@ void jit_avx2_kernel_sgemm_kern::prefetchB_afterFMA( } } -void jit_avx2_kernel_sgemm_kern::prefetchA_beforeFMA( +void jit_avx2_kernel_sgemm_kern_t::prefetchA_beforeFMA( int um, int un, int k_idx, int n_idx, int m_idx) { if (!(mayiuse(avx512_core) && __BUILD_GEMM_AVX512)) { if ((um == unroll_m_) && (un == unroll_n_)) { @@ -158,7 +158,7 @@ void jit_avx2_kernel_sgemm_kern::prefetchA_beforeFMA( } } -void jit_avx2_kernel_sgemm_kern::prefetchC_afterBload( +void jit_avx2_kernel_sgemm_kern_t::prefetchC_afterBload( int um, int un, int k_idx, int n_idx) { if (mayiuse(avx512_core) && __BUILD_GEMM_AVX512) { if (um == unroll_m_) { 
@@ -172,7 +172,7 @@ void jit_avx2_kernel_sgemm_kern::prefetchC_afterBload( } } -void jit_avx2_kernel_sgemm_kern::prefetchC_beforeKloop(int um) { +void jit_avx2_kernel_sgemm_kern_t::prefetchC_beforeKloop(int um) { if (mayiuse(avx512_core) && __BUILD_GEMM_AVX512) { if (um < unroll_m_) { prefetchw(ptr[CO2_ + elt_size_ * 0]); @@ -199,7 +199,7 @@ void jit_avx2_kernel_sgemm_kern::prefetchC_beforeKloop(int um) { } } -void jit_avx2_kernel_sgemm_kern::generate() { +void jit_avx2_kernel_sgemm_kern_t::generate() { int i, unroll_x, unroll_y, uy_bin, ux_bin; int C_off = is_windows ? 56 : 8; @@ -435,8 +435,8 @@ void jit_avx2_kernel_sgemm_kern::generate() { postamble(); } -jit_avx2_kernel_sgemm_kern::jit_avx2_kernel_sgemm_kern(bool beta_zero) - : jit_generator(jit_name()), beta_zero_(beta_zero) {} +jit_avx2_kernel_sgemm_kern_t::jit_avx2_kernel_sgemm_kern_t(bool beta_zero) + : jit_generator_t(jit_name()), beta_zero_(beta_zero) {} } // namespace x64 } // namespace cpu diff --git a/src/cpu/x64/gemm/f32/jit_avx2_kernel_sgemm_kern.hpp b/src/cpu/x64/gemm/f32/jit_avx2_kernel_sgemm_kern.hpp index c51d429c3e8..60b97371367 100644 --- a/src/cpu/x64/gemm/f32/jit_avx2_kernel_sgemm_kern.hpp +++ b/src/cpu/x64/gemm/f32/jit_avx2_kernel_sgemm_kern.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2024 Intel Corporation +* Copyright 2019-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -29,9 +29,9 @@ namespace impl { namespace cpu { namespace x64 { -class jit_avx2_kernel_sgemm_kern : public jit_generator { +class jit_avx2_kernel_sgemm_kern_t : public jit_generator_t { - DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx2_kernel_sgemm_kern); + DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx2_kernel_sgemm_kern_t); const int elt_size_ = 4; const int elt_size_bin_ = 2; int nelt_per_vecreg_ = mayiuse(avx512_core) && __BUILD_GEMM_AVX512 ? 16 : 8; @@ -79,7 +79,7 @@ class jit_avx2_kernel_sgemm_kern : public jit_generator { void prefetchA_beforeFMA(int um, int un, int k_idx, int n_idx, int m_idx); void prefetchC_afterBload(int um, int un, int k_idx, int n_idx); void prefetchC_beforeKloop(int um); - void generate() override ATTRIBUTE_OPTIMIZE; + void generate() override; template void loadA_betweenFMAs(int um, int un, int k_idx, int n_idx, int m_idx, @@ -701,7 +701,7 @@ class jit_avx2_kernel_sgemm_kern : public jit_generator { } public: - jit_avx2_kernel_sgemm_kern(bool beta_zero); + jit_avx2_kernel_sgemm_kern_t(bool beta_zero); }; } // namespace x64 } // namespace cpu diff --git a/src/cpu/x64/gemm/f32/jit_avx512_common_gemm_f32.cpp b/src/cpu/x64/gemm/f32/jit_avx512_common_gemm_f32.cpp index 007bc74fc2e..85e9e4aec69 100644 --- a/src/cpu/x64/gemm/f32/jit_avx512_common_gemm_f32.cpp +++ b/src/cpu/x64/gemm/f32/jit_avx512_common_gemm_f32.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2017-2024 Intel Corporation +* Copyright 2017-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -59,18 +59,18 @@ namespace x64 { namespace avx512_common_gemm_f32 { using namespace gemm_utils; -struct xbyak_gemm_t : public jit_generator { +struct xbyak_gemm_t : public jit_generator_t { DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx512_common_gemm_f32_xbyak_gemm) xbyak_gemm_t(char isTransA, char isTransB, float beta, bool hasBias = false) - : jit_generator(jit_name()) + : jit_generator_t(jit_name()) , isTransA(isTransA) , isTransB(isTransB) , beta(beta) , hasBias(hasBias) , STACK_K_CAPACITY((STACK_CAPACITY - 256) / (SIZE * UNROLL_M)) {} - void generate() override ATTRIBUTE_OPTIMIZE { + void generate() override { using namespace Xbyak; bool isBeta0 = (beta == 0.0); bool isBetaN = (!isBeta0 && beta != 1.0); diff --git a/src/cpu/x64/gemm/f32/jit_avx512_core_f32_copy_an_kern_autogen.cpp b/src/cpu/x64/gemm/f32/jit_avx512_core_f32_copy_an_kern_autogen.cpp index bca29715498..75b38090dcb 100644 --- a/src/cpu/x64/gemm/f32/jit_avx512_core_f32_copy_an_kern_autogen.cpp +++ b/src/cpu/x64/gemm/f32/jit_avx512_core_f32_copy_an_kern_autogen.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2024 Intel Corporation +* Copyright 2019-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -23,10 +23,10 @@ namespace impl { namespace cpu { namespace x64 { -jit_avx512_core_f32_copy_an_kern::jit_avx512_core_f32_copy_an_kern() - : jit_generator(jit_name()) {} +jit_avx512_core_f32_copy_an_kern_t::jit_avx512_core_f32_copy_an_kern_t() + : jit_generator_t(jit_name()) {} -void jit_avx512_core_f32_copy_an_kern::generate() { +void jit_avx512_core_f32_copy_an_kern_t::generate() { #ifndef _WIN32 #define M rdi diff --git a/src/cpu/x64/gemm/f32/jit_avx512_core_f32_copy_at_kern_part1_autogen.cpp b/src/cpu/x64/gemm/f32/jit_avx512_core_f32_copy_at_kern_part1_autogen.cpp index 63bb212c563..d7230690c63 100644 --- a/src/cpu/x64/gemm/f32/jit_avx512_core_f32_copy_at_kern_part1_autogen.cpp +++ b/src/cpu/x64/gemm/f32/jit_avx512_core_f32_copy_at_kern_part1_autogen.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2024 Intel Corporation +* Copyright 2019-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -24,10 +24,10 @@ namespace impl { namespace cpu { namespace x64 { -jit_avx512_core_f32_copy_at_kern::jit_avx512_core_f32_copy_at_kern() - : jit_generator(jit_name()) {} +jit_avx512_core_f32_copy_at_kern_t::jit_avx512_core_f32_copy_at_kern_t() + : jit_generator_t(jit_name()) {} -void jit_avx512_core_f32_copy_at_kern::generate() { +void jit_avx512_core_f32_copy_at_kern_t::generate() { Xbyak::Label l1f80; Xbyak::Label l22b8; Xbyak::Label l2a5c; @@ -48,9 +48,9 @@ void jit_avx512_core_f32_copy_at_kern::generate() { postamble(); } -void jit_avx512_core_f32_copy_at_kern::generate_part1(const Xbyak::Label &l4000, - const Xbyak::Label &l2a5c, const Xbyak::Label &l22b8, - const Xbyak::Label &l1f80) { +void jit_avx512_core_f32_copy_at_kern_t::generate_part1( + const Xbyak::Label &l4000, const Xbyak::Label &l2a5c, + const Xbyak::Label &l22b8, const Xbyak::Label &l1f80) { Xbyak::Label l1d30; Xbyak::Label l1d0c; Xbyak::Label l1cfc; diff --git a/src/cpu/x64/gemm/f32/jit_avx512_core_f32_copy_at_kern_part2_autogen.cpp b/src/cpu/x64/gemm/f32/jit_avx512_core_f32_copy_at_kern_part2_autogen.cpp index 51c776f1989..379a632bb1a 100644 --- a/src/cpu/x64/gemm/f32/jit_avx512_core_f32_copy_at_kern_part2_autogen.cpp +++ b/src/cpu/x64/gemm/f32/jit_avx512_core_f32_copy_at_kern_part2_autogen.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2021 Intel Corporation +* Copyright 2019-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -24,7 +24,7 @@ namespace impl { namespace cpu { namespace x64 { -void jit_avx512_core_f32_copy_at_kern::generate_part2(Xbyak::Label l4000, +void jit_avx512_core_f32_copy_at_kern_t::generate_part2(Xbyak::Label l4000, Xbyak::Label l2a5c, Xbyak::Label l22b8, Xbyak::Label l1f80) { std::vector<Xbyak::Label> labels(62); L(l1f80); diff --git a/src/cpu/x64/gemm/f32/jit_avx512_core_f32_copy_bn_kern_autogen.cpp b/src/cpu/x64/gemm/f32/jit_avx512_core_f32_copy_bn_kern_autogen.cpp index c49dbb2f743..ab581f6a2ad 100644 --- a/src/cpu/x64/gemm/f32/jit_avx512_core_f32_copy_bn_kern_autogen.cpp +++ b/src/cpu/x64/gemm/f32/jit_avx512_core_f32_copy_bn_kern_autogen.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2024 Intel Corporation +* Copyright 2019-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License.
@@ -23,10 +23,10 @@ namespace impl { namespace cpu { namespace x64 { -jit_avx512_core_f32_copy_bn_kern::jit_avx512_core_f32_copy_bn_kern() - : jit_generator(jit_name()) {} +jit_avx512_core_f32_copy_bn_kern_t::jit_avx512_core_f32_copy_bn_kern_t() + : jit_generator_t(jit_name()) {} -void jit_avx512_core_f32_copy_bn_kern::generate() { +void jit_avx512_core_f32_copy_bn_kern_t::generate() { #ifndef _WIN32 #define M rdi diff --git a/src/cpu/x64/gemm/f32/jit_avx512_core_f32_copy_bt_kern_autogen.cpp b/src/cpu/x64/gemm/f32/jit_avx512_core_f32_copy_bt_kern_autogen.cpp index 24d3145349f..99e101a7525 100644 --- a/src/cpu/x64/gemm/f32/jit_avx512_core_f32_copy_bt_kern_autogen.cpp +++ b/src/cpu/x64/gemm/f32/jit_avx512_core_f32_copy_bt_kern_autogen.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2024 Intel Corporation +* Copyright 2019-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -23,10 +23,10 @@ namespace impl { namespace cpu { namespace x64 { -jit_avx512_core_f32_copy_bt_kern::jit_avx512_core_f32_copy_bt_kern() - : jit_generator(jit_name()) {} +jit_avx512_core_f32_copy_bt_kern_t::jit_avx512_core_f32_copy_bt_kern_t() + : jit_generator_t(jit_name()) {} -void jit_avx512_core_f32_copy_bt_kern::generate() { +void jit_avx512_core_f32_copy_bt_kern_t::generate() { #ifndef _WIN32 #define M rdi diff --git a/src/cpu/x64/gemm/f32/jit_avx512_core_gemm_smalln_tn_f32_kern.cpp b/src/cpu/x64/gemm/f32/jit_avx512_core_gemm_smalln_tn_f32_kern.cpp index 6c675d189d3..3787430d2bf 100644 --- a/src/cpu/x64/gemm/f32/jit_avx512_core_gemm_smalln_tn_f32_kern.cpp +++ b/src/cpu/x64/gemm/f32/jit_avx512_core_gemm_smalln_tn_f32_kern.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2020-2024 Intel Corporation +* Copyright 2020-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -43,13 +43,13 @@ static inline Xbyak::Ymm make_ymm(const Xbyak::Zmm &v) { namespace avx512_core_gemm_smalln_tn_f32 { -struct xbyak_gemm_smalln_tn_t : public jit_generator { +struct xbyak_gemm_smalln_tn_t : public jit_generator_t { DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx512_core_gemm_smalln_tn_xbyak_gemm) xbyak_gemm_smalln_tn_t(int N, float beta, float alpha) - : jit_generator(jit_name()), N(N), beta(beta), alpha(alpha) {} + : jit_generator_t(jit_name()), N(N), beta(beta), alpha(alpha) {} - void generate() override ATTRIBUTE_OPTIMIZE { + void generate() override { using namespace Xbyak; /** * numN = 1 : 16 rows of A, 1x16 accumulators diff --git a/src/cpu/x64/gemm/f32/jit_avx_f32_copy_an_kern_autogen.cpp b/src/cpu/x64/gemm/f32/jit_avx_f32_copy_an_kern_autogen.cpp index 117de225946..4354e22db58 100644 --- a/src/cpu/x64/gemm/f32/jit_avx_f32_copy_an_kern_autogen.cpp +++ b/src/cpu/x64/gemm/f32/jit_avx_f32_copy_an_kern_autogen.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2024 Intel Corporation +* Copyright 2019-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -23,10 +23,10 @@ namespace impl { namespace cpu { namespace x64 { -jit_avx_f32_copy_an_kern::jit_avx_f32_copy_an_kern() - : jit_generator(jit_name()) {} +jit_avx_f32_copy_an_kern_t::jit_avx_f32_copy_an_kern_t() + : jit_generator_t(jit_name()) {} -void jit_avx_f32_copy_an_kern::generate() { +void jit_avx_f32_copy_an_kern_t::generate() { #ifndef _WIN32 #define M rdi diff --git a/src/cpu/x64/gemm/f32/jit_avx_f32_copy_at_kern_autogen.cpp b/src/cpu/x64/gemm/f32/jit_avx_f32_copy_at_kern_autogen.cpp index 20e8c67d6be..700ff542285 100644 --- a/src/cpu/x64/gemm/f32/jit_avx_f32_copy_at_kern_autogen.cpp +++ b/src/cpu/x64/gemm/f32/jit_avx_f32_copy_at_kern_autogen.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2024 Intel Corporation +* Copyright 2019-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -23,10 +23,10 @@ namespace impl { namespace cpu { namespace x64 { -jit_avx_f32_copy_at_kern::jit_avx_f32_copy_at_kern() - : jit_generator(jit_name()) {} +jit_avx_f32_copy_at_kern_t::jit_avx_f32_copy_at_kern_t() + : jit_generator_t(jit_name()) {} -void jit_avx_f32_copy_at_kern::generate() { +void jit_avx_f32_copy_at_kern_t::generate() { #ifndef _WIN32 #define M rdi diff --git a/src/cpu/x64/gemm/f32/jit_avx_f32_copy_bn_kern_autogen.cpp b/src/cpu/x64/gemm/f32/jit_avx_f32_copy_bn_kern_autogen.cpp index 277144c5fbd..ed0494c469b 100644 --- a/src/cpu/x64/gemm/f32/jit_avx_f32_copy_bn_kern_autogen.cpp +++ b/src/cpu/x64/gemm/f32/jit_avx_f32_copy_bn_kern_autogen.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2024 Intel Corporation +* Copyright 2019-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -23,10 +23,10 @@ namespace impl { namespace cpu { namespace x64 { -jit_avx_f32_copy_bn_kern::jit_avx_f32_copy_bn_kern() - : jit_generator(jit_name()) {} +jit_avx_f32_copy_bn_kern_t::jit_avx_f32_copy_bn_kern_t() + : jit_generator_t(jit_name()) {} -void jit_avx_f32_copy_bn_kern::generate() { +void jit_avx_f32_copy_bn_kern_t::generate() { #ifndef _WIN32 #define M rdi diff --git a/src/cpu/x64/gemm/f32/jit_avx_f32_copy_bt_kern_autogen.cpp b/src/cpu/x64/gemm/f32/jit_avx_f32_copy_bt_kern_autogen.cpp index a7d9fe4fa04..e59bb0a5d8b 100644 --- a/src/cpu/x64/gemm/f32/jit_avx_f32_copy_bt_kern_autogen.cpp +++ b/src/cpu/x64/gemm/f32/jit_avx_f32_copy_bt_kern_autogen.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2024 Intel Corporation +* Copyright 2019-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -23,10 +23,10 @@ namespace impl { namespace cpu { namespace x64 { -jit_avx_f32_copy_bt_kern::jit_avx_f32_copy_bt_kern() - : jit_generator(jit_name()) {} +jit_avx_f32_copy_bt_kern_t::jit_avx_f32_copy_bt_kern_t() + : jit_generator_t(jit_name()) {} -void jit_avx_f32_copy_bt_kern::generate() { +void jit_avx_f32_copy_bt_kern_t::generate() { #ifndef _WIN32 #define M rdi diff --git a/src/cpu/x64/gemm/f32/jit_avx_gemm_f32.cpp b/src/cpu/x64/gemm/f32/jit_avx_gemm_f32.cpp index 8740d81d8c0..38a01ce662e 100644 --- a/src/cpu/x64/gemm/f32/jit_avx_gemm_f32.cpp +++ b/src/cpu/x64/gemm/f32/jit_avx_gemm_f32.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2016-2024 Intel Corporation +* Copyright 2016-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -58,11 +58,11 @@ namespace avx_gemm_f32 { using namespace gemm_utils; using namespace Xbyak; -struct xbyak_gemm_t : public jit_generator { +struct xbyak_gemm_t : public jit_generator_t { DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx_gemm_f32_xbyak_gemm) xbyak_gemm_t(char isTransA, char isTransB, float beta, bool hasBias = false) - : jit_generator(jit_name()) + : jit_generator_t(jit_name()) , isTransA(isTransA) , isTransB(isTransB) , hasBias(hasBias) @@ -1966,7 +1966,7 @@ struct xbyak_gemm_t : public jit_generator { if (hasBias) { add(BIAS, unroll_m * SIZE); } } - void generate() override ATTRIBUTE_OPTIMIZE { + void generate() override { assert(IMPLICATION(!is_avx2, mayiuse(avx))); preamble(); diff --git a/src/cpu/x64/gemm/f32/jit_avx_gemv_t_f32_kern.cpp b/src/cpu/x64/gemm/f32/jit_avx_gemv_t_f32_kern.cpp index d85f65fb581..394eb40f2e7 100644 --- a/src/cpu/x64/gemm/f32/jit_avx_gemv_t_f32_kern.cpp +++ b/src/cpu/x64/gemm/f32/jit_avx_gemv_t_f32_kern.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2024 Intel Corporation +* Copyright 2019-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -38,7 +38,7 @@ static inline Xmm make_xmm(const Xmm &v) { } // Load vector register data for x, y or A. -void jit_avx_gemv_t_f32_kern::v_load( +void jit_avx_gemv_t_f32_kern_t::v_load( const Xbyak::Xmm &dst, const Xbyak::Address &src, int nelems) { switch (nelems) { case 1: vmovss(make_xmm(dst), src); break; @@ -52,7 +52,7 @@ void jit_avx_gemv_t_f32_kern::v_load( } // Store vector register data for x, y or A. -void jit_avx_gemv_t_f32_kern::v_store( +void jit_avx_gemv_t_f32_kern_t::v_store( const Xbyak::Address &dst, const Xbyak::Xmm &src, int nelems) { switch (nelems) { case 1: vmovss(dst, make_xmm(src)); break; @@ -67,7 +67,7 @@ void jit_avx_gemv_t_f32_kern::v_store( // Perform Hadamard product of 2 vectors and accumulate. // Use FMA instruction, otherwise emulate. -void jit_avx_gemv_t_f32_kern::dot_product( +void jit_avx_gemv_t_f32_kern_t::dot_product( const Xmm &dst, const Xmm &src1, const Xmm &src2) { if (is_avx2_) vfmadd231ps(dst, src1, src2); @@ -78,7 +78,7 @@ void jit_avx_gemv_t_f32_kern::dot_product( } // Inner loop. 
-void jit_avx_gemv_t_f32_kern::innerloop(int unroll_m, int unroll_n) { +void jit_avx_gemv_t_f32_kern_t::innerloop(int unroll_m, int unroll_n) { if ((unroll_m > M_UNROLL_) || (unroll_n > N_UNROLL_) || (unroll_m < 0) || (unroll_n < 0)) return; @@ -119,7 +119,7 @@ void jit_avx_gemv_t_f32_kern::innerloop(int unroll_m, int unroll_n) { } // Outer loop. -void jit_avx_gemv_t_f32_kern::outerloop( +void jit_avx_gemv_t_f32_kern_t::outerloop( int unroll_x, int unroll_y, Label *&cur_outerloop_label) { if ((unroll_x > M_UNROLL_) || (unroll_y > N_UNROLL_) || (unroll_y < 0) || (unroll_x < 0)) @@ -259,7 +259,7 @@ void jit_avx_gemv_t_f32_kern::outerloop( align(16); } -void jit_avx_gemv_t_f32_kern::generate() { +void jit_avx_gemv_t_f32_kern_t::generate() { // Prologue preamble(); @@ -301,8 +301,8 @@ void jit_avx_gemv_t_f32_kern::generate() { } // Function signature: gemv(*m, *n, *alpha, *a, *lda, *x, *incx, *y, *incy) -jit_avx_gemv_t_f32_kern::jit_avx_gemv_t_f32_kern() - : jit_generator(jit_name()) +jit_avx_gemv_t_f32_kern_t::jit_avx_gemv_t_f32_kern_t() + : jit_generator_t(jit_name()) , is_avx2_(mayiuse(avx2)) , LDA_(is_windows ? rdi : r8) , X_(is_windows ? rsi : r9) diff --git a/src/cpu/x64/gemm/f32/jit_avx_gemv_t_f32_kern.hpp b/src/cpu/x64/gemm/f32/jit_avx_gemv_t_f32_kern.hpp index 1ed21b708ff..d4b07183ed5 100644 --- a/src/cpu/x64/gemm/f32/jit_avx_gemv_t_f32_kern.hpp +++ b/src/cpu/x64/gemm/f32/jit_avx_gemv_t_f32_kern.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2021 Intel Corporation +* Copyright 2019-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -24,10 +24,10 @@ namespace impl { namespace cpu { namespace x64 { -class jit_avx_gemv_t_f32_kern : public jit_generator { +class jit_avx_gemv_t_f32_kern_t : public jit_generator_t { public: - jit_avx_gemv_t_f32_kern(void); - DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx_gemv_t_f32_kern); + jit_avx_gemv_t_f32_kern_t(void); + DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx_gemv_t_f32_kern_t); protected: bool is_avx2_; @@ -40,7 +40,7 @@ class jit_avx_gemv_t_f32_kern : public jit_generator { void innerloop(int unroll_m, int unroll_n); void outerloop(int unroll_x, int unroll_y, Xbyak::Label *&outerloop_label); - void generate() override ATTRIBUTE_OPTIMIZE; + void generate() override; private: static const int M_UNROLL_ = 16; diff --git a/src/cpu/x64/gemm/f32/jit_avx_kernel_b0_sgemm_kern_part1_autogen.cpp b/src/cpu/x64/gemm/f32/jit_avx_kernel_b0_sgemm_kern_part1_autogen.cpp index 52fccd21619..32a2f5860dd 100644 --- a/src/cpu/x64/gemm/f32/jit_avx_kernel_b0_sgemm_kern_part1_autogen.cpp +++ b/src/cpu/x64/gemm/f32/jit_avx_kernel_b0_sgemm_kern_part1_autogen.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2024 Intel Corporation +* Copyright 2019-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -24,10 +24,10 @@ namespace impl { namespace cpu { namespace x64 { -jit_avx_kernel_b0_sgemm_kern::jit_avx_kernel_b0_sgemm_kern() - : jit_generator(jit_name()) {} +jit_avx_kernel_b0_sgemm_kern_t::jit_avx_kernel_b0_sgemm_kern_t() + : jit_generator_t(jit_name()) {} -void jit_avx_kernel_b0_sgemm_kern::generate() { +void jit_avx_kernel_b0_sgemm_kern_t::generate() { Xbyak::Label l259c; Xbyak::Label l2774; Xbyak::Label l2834; @@ -52,7 +52,7 @@ void jit_avx_kernel_b0_sgemm_kern::generate() { postamble(); } -void jit_avx_kernel_b0_sgemm_kern::generate_part1(const Xbyak::Label &l2cf4, +void jit_avx_kernel_b0_sgemm_kern_t::generate_part1(const Xbyak::Label &l2cf4, const Xbyak::Label &l2834, const Xbyak::Label &l2774, const Xbyak::Label &l259c) { std::vector<Xbyak::Label> labels(55); diff --git a/src/cpu/x64/gemm/f32/jit_avx_kernel_b0_sgemm_kern_part2_autogen.cpp b/src/cpu/x64/gemm/f32/jit_avx_kernel_b0_sgemm_kern_part2_autogen.cpp index 74d2c82cbc4..35a9ea2f626 100644 --- a/src/cpu/x64/gemm/f32/jit_avx_kernel_b0_sgemm_kern_part2_autogen.cpp +++ b/src/cpu/x64/gemm/f32/jit_avx_kernel_b0_sgemm_kern_part2_autogen.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2021 Intel Corporation +* Copyright 2019-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -24,7 +24,7 @@ namespace impl { namespace cpu { namespace x64 { -void jit_avx_kernel_b0_sgemm_kern::generate_part2(Xbyak::Label l2cf4, +void jit_avx_kernel_b0_sgemm_kern_t::generate_part2(Xbyak::Label l2cf4, Xbyak::Label l2834, Xbyak::Label l2774, Xbyak::Label l259c) { std::vector<Xbyak::Label> labels(57); L(labels[56]); diff --git a/src/cpu/x64/gemm/f32/jit_avx_kernel_sgemm_kern_part1_autogen.cpp b/src/cpu/x64/gemm/f32/jit_avx_kernel_sgemm_kern_part1_autogen.cpp index 8ea5bd9a729..daeba0781ea 100644 --- a/src/cpu/x64/gemm/f32/jit_avx_kernel_sgemm_kern_part1_autogen.cpp +++ b/src/cpu/x64/gemm/f32/jit_avx_kernel_sgemm_kern_part1_autogen.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2024 Intel Corporation +* Copyright 2019-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License.
@@ -23,10 +23,10 @@ namespace impl { namespace cpu { namespace x64 { -jit_avx_kernel_sgemm_kern::jit_avx_kernel_sgemm_kern() - : jit_generator(jit_name()) {} +jit_avx_kernel_sgemm_kern_t::jit_avx_kernel_sgemm_kern_t() + : jit_generator_t(jit_name()) {} -void jit_avx_kernel_sgemm_kern::generate() { +void jit_avx_kernel_sgemm_kern_t::generate() { Xbyak::Label l1efc; Xbyak::Label l1f44; Xbyak::Label l1f48; @@ -40,13 +40,13 @@ void jit_avx_kernel_sgemm_kern::generate() { mov(C, ptr[OLD_C]); mov(LDC, ptr[OLD_LDC]); - jit_avx_kernel_sgemm_kern::generate_part1(l1efc, l1f44, l1f48); - jit_avx_kernel_sgemm_kern::generate_part2(l1efc, l1f44, l1f48); + jit_avx_kernel_sgemm_kern_t::generate_part1(l1efc, l1f44, l1f48); + jit_avx_kernel_sgemm_kern_t::generate_part2(l1efc, l1f44, l1f48); postamble(); } -void jit_avx_kernel_sgemm_kern::generate_part1(const Xbyak::Label &l1efc, +void jit_avx_kernel_sgemm_kern_t::generate_part1(const Xbyak::Label &l1efc, const Xbyak::Label &l1f44, const Xbyak::Label &l1f48) { std::vector<Xbyak::Label> labels(44); diff --git a/src/cpu/x64/gemm/f32/jit_avx_kernel_sgemm_kern_part2_autogen.cpp b/src/cpu/x64/gemm/f32/jit_avx_kernel_sgemm_kern_part2_autogen.cpp index a8154c7a1c8..e1ff79875f0 100644 --- a/src/cpu/x64/gemm/f32/jit_avx_kernel_sgemm_kern_part2_autogen.cpp +++ b/src/cpu/x64/gemm/f32/jit_avx_kernel_sgemm_kern_part2_autogen.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2021 Intel Corporation +* Copyright 2019-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -23,7 +23,7 @@ namespace impl { namespace cpu { namespace x64 { -void jit_avx_kernel_sgemm_kern::generate_part2( +void jit_avx_kernel_sgemm_kern_t::generate_part2( Xbyak::Label &l1efc, Xbyak::Label &l1f44, Xbyak::Label &l1f48) { std::vector<Xbyak::Label> labels(69); L(l1efc); diff --git a/src/cpu/x64/gemm/f32/jit_sse41_f32_copy_an_kern_autogen.cpp b/src/cpu/x64/gemm/f32/jit_sse41_f32_copy_an_kern_autogen.cpp index 57039cba5b0..9fe1c3a386b 100644 --- a/src/cpu/x64/gemm/f32/jit_sse41_f32_copy_an_kern_autogen.cpp +++ b/src/cpu/x64/gemm/f32/jit_sse41_f32_copy_an_kern_autogen.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2024 Intel Corporation +* Copyright 2019-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License.
@@ -23,10 +23,10 @@ namespace impl { namespace cpu { namespace x64 { -jit_sse41_f32_copy_an_kern::jit_sse41_f32_copy_an_kern() - : jit_generator(jit_name()) {} +jit_sse41_f32_copy_an_kern_t::jit_sse41_f32_copy_an_kern_t() + : jit_generator_t(jit_name()) {} -void jit_sse41_f32_copy_an_kern::generate() { +void jit_sse41_f32_copy_an_kern_t::generate() { #ifndef _WIN32 #define M rdi diff --git a/src/cpu/x64/gemm/f32/jit_sse41_f32_copy_at_kern_autogen.cpp b/src/cpu/x64/gemm/f32/jit_sse41_f32_copy_at_kern_autogen.cpp index b1381469d1b..d52a86a0726 100644 --- a/src/cpu/x64/gemm/f32/jit_sse41_f32_copy_at_kern_autogen.cpp +++ b/src/cpu/x64/gemm/f32/jit_sse41_f32_copy_at_kern_autogen.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2024 Intel Corporation +* Copyright 2019-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -23,10 +23,10 @@ namespace impl { namespace cpu { namespace x64 { -jit_sse41_f32_copy_at_kern::jit_sse41_f32_copy_at_kern() - : jit_generator(jit_name()) {} +jit_sse41_f32_copy_at_kern_t::jit_sse41_f32_copy_at_kern_t() + : jit_generator_t(jit_name()) {} -void jit_sse41_f32_copy_at_kern::generate() { +void jit_sse41_f32_copy_at_kern_t::generate() { #ifndef _WIN32 #define M rdi diff --git a/src/cpu/x64/gemm/f32/jit_sse41_f32_copy_bn_kern_autogen.cpp b/src/cpu/x64/gemm/f32/jit_sse41_f32_copy_bn_kern_autogen.cpp index f095bf750e9..36c56697f43 100644 --- a/src/cpu/x64/gemm/f32/jit_sse41_f32_copy_bn_kern_autogen.cpp +++ b/src/cpu/x64/gemm/f32/jit_sse41_f32_copy_bn_kern_autogen.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2024 Intel Corporation +* Copyright 2019-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -23,10 +23,10 @@ namespace impl { namespace cpu { namespace x64 { -jit_sse41_f32_copy_bn_kern::jit_sse41_f32_copy_bn_kern() - : jit_generator(jit_name()) {} +jit_sse41_f32_copy_bn_kern_t::jit_sse41_f32_copy_bn_kern_t() + : jit_generator_t(jit_name()) {} -void jit_sse41_f32_copy_bn_kern::generate() { +void jit_sse41_f32_copy_bn_kern_t::generate() { #ifndef _WIN32 #define M rdi diff --git a/src/cpu/x64/gemm/f32/jit_sse41_f32_copy_bt_kern_autogen.cpp b/src/cpu/x64/gemm/f32/jit_sse41_f32_copy_bt_kern_autogen.cpp index 3f509e5dcef..b985134391e 100644 --- a/src/cpu/x64/gemm/f32/jit_sse41_f32_copy_bt_kern_autogen.cpp +++ b/src/cpu/x64/gemm/f32/jit_sse41_f32_copy_bt_kern_autogen.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2024 Intel Corporation +* Copyright 2019-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -23,10 +23,10 @@ namespace impl { namespace cpu { namespace x64 { -jit_sse41_f32_copy_bt_kern::jit_sse41_f32_copy_bt_kern() - : jit_generator(jit_name()) {} +jit_sse41_f32_copy_bt_kern_t::jit_sse41_f32_copy_bt_kern_t() + : jit_generator_t(jit_name()) {} -void jit_sse41_f32_copy_bt_kern::generate() { +void jit_sse41_f32_copy_bt_kern_t::generate() { #ifndef _WIN32 #define M rdi diff --git a/src/cpu/x64/gemm/f32/jit_sse41_gemv_n_f32_kern.cpp b/src/cpu/x64/gemm/f32/jit_sse41_gemv_n_f32_kern.cpp index cb195f55006..83b171466aa 100644 --- a/src/cpu/x64/gemm/f32/jit_sse41_gemv_n_f32_kern.cpp +++ b/src/cpu/x64/gemm/f32/jit_sse41_gemv_n_f32_kern.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2021-2024 Intel Corporation +* Copyright 2021-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -58,7 +58,7 @@ static inline int log2_of_pow2(int n) { } // Load vector register data for x, y or A. -void jit_sse41_gemv_n_f32_kern::v_load( +void jit_sse41_gemv_n_f32_kern_t::v_load( const Xmm &dst, const Address &src, int nelems) { if (nelems >= v_nelems_) { uni_vmovups(dst, src); @@ -82,7 +82,7 @@ void jit_sse41_gemv_n_f32_kern::v_load( } // Store vector register data for x, y or A. -void jit_sse41_gemv_n_f32_kern::v_store( +void jit_sse41_gemv_n_f32_kern_t::v_store( const Address &dst, const Xmm &src, int nelems) { if (nelems >= v_nelems_) { uni_vmovups(dst, src); @@ -107,7 +107,7 @@ void jit_sse41_gemv_n_f32_kern::v_store( // Perform Hadamard product of 2 vectors and accumulate. // Use FMA instruction, otherwise emulate. -void jit_sse41_gemv_n_f32_kern::dot_product( +void jit_sse41_gemv_n_f32_kern_t::dot_product( const Xmm &dst, const Xmm &src1, const Xmm &src2) { if (has_avx2_) vfmadd231ps(dst, src1, src2); @@ -120,7 +120,7 @@ void jit_sse41_gemv_n_f32_kern::dot_product( } } -void jit_sse41_gemv_n_f32_kern::kernel_loop( +void jit_sse41_gemv_n_f32_kern_t::kernel_loop( int unroll_m, int unroll_n, bool fetch, bool last) { int um_vecs = utils::div_up(unroll_m, v_nelems_); @@ -168,7 +168,7 @@ void jit_sse41_gemv_n_f32_kern::kernel_loop( } // Inner loop for A non-transposed. -void jit_sse41_gemv_n_f32_kern::innerloop(int unroll_m, int unroll_n) { +void jit_sse41_gemv_n_f32_kern_t::innerloop(int unroll_m, int unroll_n) { mov(Y1_, Y_); // Load x and scale by alpha. 
@@ -237,7 +237,7 @@ void jit_sse41_gemv_n_f32_kern::innerloop(int unroll_m, int unroll_n) { L_aligned(label_m_loop_end); } -void jit_sse41_gemv_n_f32_kern::outerloop(int unroll_x, int unroll_y, +void jit_sse41_gemv_n_f32_kern_t::outerloop(int unroll_x, int unroll_y, Label *&cur_outerloop_label, Label *&outerloop_end_label) { bool is_tail = unroll_y < unroll_n_; @@ -270,7 +270,7 @@ void jit_sse41_gemv_n_f32_kern::outerloop(int unroll_x, int unroll_y, } } -void jit_sse41_gemv_n_f32_kern::generate() { +void jit_sse41_gemv_n_f32_kern_t::generate() { // Prologue preamble(); @@ -313,8 +313,8 @@ void jit_sse41_gemv_n_f32_kern::generate() { } // Function signature: gemv(*m, *n, *alpha, *a, *lda, *x, *incx, *y, *incy) -jit_sse41_gemv_n_f32_kern::jit_sse41_gemv_n_f32_kern(void) - : jit_generator(jit_name()) +jit_sse41_gemv_n_f32_kern_t::jit_sse41_gemv_n_f32_kern_t(void) + : jit_generator_t(jit_name()) , has_avx512_(mayiuse(avx512_core) && __BUILD_GEMM_AVX512) , has_avx2_(mayiuse(avx2) && __BUILD_GEMM_AVX2) , has_avx_(mayiuse(avx) && __BUILD_GEMM_AVX2) diff --git a/src/cpu/x64/gemm/f32/jit_sse41_gemv_n_f32_kern.hpp b/src/cpu/x64/gemm/f32/jit_sse41_gemv_n_f32_kern.hpp index 89886aed939..8058122ea18 100644 --- a/src/cpu/x64/gemm/f32/jit_sse41_gemv_n_f32_kern.hpp +++ b/src/cpu/x64/gemm/f32/jit_sse41_gemv_n_f32_kern.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2021 Intel Corporation +* Copyright 2021-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -24,10 +24,10 @@ namespace impl { namespace cpu { namespace x64 { -class jit_sse41_gemv_n_f32_kern : public jit_generator { +class jit_sse41_gemv_n_f32_kern_t : public jit_generator_t { public: - jit_sse41_gemv_n_f32_kern(); - DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_sse41_gemv_n_f32_kern); + jit_sse41_gemv_n_f32_kern_t(); + DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_sse41_gemv_n_f32_kern_t); protected: bool has_avx512_; @@ -61,7 +61,7 @@ class jit_sse41_gemv_n_f32_kern : public jit_generator { Xbyak::Label *&cur_outerloop_label, Xbyak::Label *&outerloop_end_label); - void generate() override ATTRIBUTE_OPTIMIZE; + void generate() override; private: static const int max_um_vecs_ = 16; diff --git a/src/cpu/x64/gemm/f32/jit_sse41_gemv_t_f32_kern.cpp b/src/cpu/x64/gemm/f32/jit_sse41_gemv_t_f32_kern.cpp index b3b578975fc..da67ca31e78 100644 --- a/src/cpu/x64/gemm/f32/jit_sse41_gemv_t_f32_kern.cpp +++ b/src/cpu/x64/gemm/f32/jit_sse41_gemv_t_f32_kern.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2024 Intel Corporation +* Copyright 2019-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -30,7 +30,7 @@ namespace x64 { using namespace Xbyak; // Load vector register data for x, y or A. -void jit_sse41_gemv_t_f32_kern::v_load( +void jit_sse41_gemv_t_f32_kern_t::v_load( const Xbyak::Xmm &dst, const Xbyak::Address &src, int nelems) { switch (nelems) { case 1: movss(dst, src); break; @@ -43,7 +43,7 @@ void jit_sse41_gemv_t_f32_kern::v_load( } // Store vector register data for x, y or A. 
-void jit_sse41_gemv_t_f32_kern::v_store( +void jit_sse41_gemv_t_f32_kern_t::v_store( const Xbyak::Address &dst, const Xbyak::Xmm &src, int nelems) { switch (nelems) { case 1: movss(dst, src); break; @@ -56,14 +56,14 @@ void jit_sse41_gemv_t_f32_kern::v_store( } // Perform Hadamard product of 2 vectors and accumulate. -void jit_sse41_gemv_t_f32_kern::dot_product( +void jit_sse41_gemv_t_f32_kern_t::dot_product( const Xmm &dst, const Xmm &src1, const Xmm &src2) { mulps(src2, src1); addps(dst, src2); } // Inner loop. -void jit_sse41_gemv_t_f32_kern::innerloop(int unroll_m, int unroll_n) { +void jit_sse41_gemv_t_f32_kern_t::innerloop(int unroll_m, int unroll_n) { if ((unroll_m > M_UNROLL_) || (unroll_n > N_UNROLL_) || (unroll_m < 0) || (unroll_n < 0)) return; @@ -104,7 +104,7 @@ void jit_sse41_gemv_t_f32_kern::innerloop(int unroll_m, int unroll_n) { } // Outer loop. -void jit_sse41_gemv_t_f32_kern::outerloop( +void jit_sse41_gemv_t_f32_kern_t::outerloop( int unroll_x, int unroll_y, Label *&cur_outerloop_label) { if ((unroll_x > M_UNROLL_) || (unroll_y > N_UNROLL_) || (unroll_y < 0) || unroll_x < 0) @@ -230,7 +230,7 @@ void jit_sse41_gemv_t_f32_kern::outerloop( align(16); } -void jit_sse41_gemv_t_f32_kern::generate() { +void jit_sse41_gemv_t_f32_kern_t::generate() { // Prologue preamble(); @@ -272,8 +272,8 @@ void jit_sse41_gemv_t_f32_kern::generate() { } // Function signature: gemv(*m, *n, *alpha, *a, *lda, *x, *incx, *y, *incy) -jit_sse41_gemv_t_f32_kern::jit_sse41_gemv_t_f32_kern() - : jit_generator(jit_name()) +jit_sse41_gemv_t_f32_kern_t::jit_sse41_gemv_t_f32_kern_t() + : jit_generator_t(jit_name()) , LDA_(is_windows ? rdi : r8) , X_(is_windows ? rsi : r9) , INCY_(r10) diff --git a/src/cpu/x64/gemm/f32/jit_sse41_gemv_t_f32_kern.hpp b/src/cpu/x64/gemm/f32/jit_sse41_gemv_t_f32_kern.hpp index 8a32fb4beff..9f79643ab8e 100644 --- a/src/cpu/x64/gemm/f32/jit_sse41_gemv_t_f32_kern.hpp +++ b/src/cpu/x64/gemm/f32/jit_sse41_gemv_t_f32_kern.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2021 Intel Corporation +* Copyright 2019-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -24,10 +24,10 @@ namespace impl { namespace cpu { namespace x64 { -class jit_sse41_gemv_t_f32_kern : public jit_generator { +class jit_sse41_gemv_t_f32_kern_t : public jit_generator_t { public: - jit_sse41_gemv_t_f32_kern(void); - DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_sse41_gemv_t_f32_kern); + jit_sse41_gemv_t_f32_kern_t(void); + DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_sse41_gemv_t_f32_kern_t); protected: void v_load(const Xbyak::Xmm &dst, const Xbyak::Address &src, int nelems); @@ -38,7 +38,7 @@ class jit_sse41_gemv_t_f32_kern : public jit_generator { void innerloop(int unroll_m, int unroll_n); void outerloop(int unroll_x, int unroll_y, Xbyak::Label *&outerloop_label); - void generate() override ATTRIBUTE_OPTIMIZE; + void generate() override; private: static const int M_UNROLL_ = 8; diff --git a/src/cpu/x64/gemm/f32/jit_sse41_kernel_b0_sgemm_kern_autogen.cpp b/src/cpu/x64/gemm/f32/jit_sse41_kernel_b0_sgemm_kern_autogen.cpp index ae734d720b7..a2d2934f144 100644 --- a/src/cpu/x64/gemm/f32/jit_sse41_kernel_b0_sgemm_kern_autogen.cpp +++ b/src/cpu/x64/gemm/f32/jit_sse41_kernel_b0_sgemm_kern_autogen.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2024 Intel Corporation +* Copyright 2019-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -23,10 +23,10 @@ namespace impl { namespace cpu { namespace x64 { -jit_sse41_kernel_b0_sgemm_kern::jit_sse41_kernel_b0_sgemm_kern() - : jit_generator(jit_name()) {} +jit_sse41_kernel_b0_sgemm_kern_t::jit_sse41_kernel_b0_sgemm_kern_t() + : jit_generator_t(jit_name()) {} -void jit_sse41_kernel_b0_sgemm_kern::generate() { +void jit_sse41_kernel_b0_sgemm_kern_t::generate() { #ifndef _WIN32 diff --git a/src/cpu/x64/gemm/f32/jit_sse41_kernel_sgemm_kern_autogen.cpp b/src/cpu/x64/gemm/f32/jit_sse41_kernel_sgemm_kern_autogen.cpp index ba6a36882ed..6d900e70fb4 100644 --- a/src/cpu/x64/gemm/f32/jit_sse41_kernel_sgemm_kern_autogen.cpp +++ b/src/cpu/x64/gemm/f32/jit_sse41_kernel_sgemm_kern_autogen.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2019-2024 Intel Corporation +* Copyright 2019-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -23,10 +23,10 @@ namespace impl { namespace cpu { namespace x64 { -jit_sse41_kernel_sgemm_kern::jit_sse41_kernel_sgemm_kern() - : jit_generator(jit_name()) {} +jit_sse41_kernel_sgemm_kern_t::jit_sse41_kernel_sgemm_kern_t() + : jit_generator_t(jit_name()) {} -void jit_sse41_kernel_sgemm_kern::generate() { +void jit_sse41_kernel_sgemm_kern_t::generate() { #ifndef _WIN32 diff --git a/src/cpu/x64/gemm/gemm_driver.cpp b/src/cpu/x64/gemm/gemm_driver.cpp index dae0d417f46..aaacd0931e2 100644 --- a/src/cpu/x64/gemm/gemm_driver.cpp +++ b/src/cpu/x64/gemm/gemm_driver.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2018-2024 Intel Corporation +* Copyright 2018-2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -80,15 +80,15 @@ int get_vector_length() {
         //dummy if
 #if __BUILD_GEMM_AVX512
     } else if (mayiuse(avx512_core)) {
-        v_bytes = cpu_isa_traits<avx512_core>::vlen;
+        v_bytes = cpu_isa_traits_t<avx512_core>::vlen;
 #endif
 #if __BUILD_GEMM_AVX2
     } else if (mayiuse(avx)) {
-        v_bytes = cpu_isa_traits<avx>::vlen;
+        v_bytes = cpu_isa_traits_t<avx>::vlen;
 #endif
 #if __BUILD_GEMM_SSE41
     } else if (mayiuse(sse41)) {
-        v_bytes = cpu_isa_traits<sse41>::vlen;
+        v_bytes = cpu_isa_traits_t<sse41>::vlen;
 #endif
     } else {
         assert(!"not supposed to be reached.");
@@ -115,7 +115,7 @@ static inline void add_results(const dim_t m, const dim_t n, const float alpha,
         c_type *c_data, const dim_t ldc, const c_type *co,
         offset_type offsetc) {
-    constexpr bool is_int8 = data_traits<c_type>::data_type == data_type::s32;
+    constexpr bool is_int8 = data_traits_t<c_type>::data_type == data_type::s32;
 
     for (dim_t j = 0; j < n; ++j) {
         for (dim_t i = 0; i < m; ++i) {
@@ -254,7 +254,7 @@ static inline void *align(void *ptr, size_t alignment) {
 template <typename mat_t, typename scale_t>
 void scale_matrix(
         dim_t m, dim_t n, scale_t alpha, mat_t *__restrict p_mat, dim_t ld) {
-    if (data_traits<mat_t>::data_type == data_type::f32) {
+    if (data_traits_t<mat_t>::data_type == data_type::f32) {
         for (dim_t j = 0; j < n; j++) {
             for (dim_t i = 0; i < m; i++) {
                 p_mat[i + j * ld] = (mat_t)((scale_t)p_mat[i + j * ld] * alpha);
@@ -400,8 +400,8 @@ void gemm_kernel(dim_t m, dim_t n, const dim_t k, const float alpha,
     bool row_req = false;
 
     constexpr bool is_int8 = utils::one_of(
-            data_traits<a_type>::data_type, data_type::s8, data_type::u8);
-    constexpr bool is_f32 = data_traits<a_type>::data_type == data_type::f32;
+            data_traits_t<a_type>::data_type, data_type::s8, data_type::u8);
+    constexpr bool is_f32 = data_traits_t<a_type>::data_type == data_type::f32;
     bool is_int8_amx = is_int8 && mayiuse(avx512_core_amx) && __BUILD_GEMM_AMX;
 
     dim_t m_stk = col_offset_ws ? 1 : m;
@@ -547,8 +547,9 @@ static dnnl_status_t gemm_kernel_driver(int ithr, dim_t m, dim_t n, dim_t k,
     float alpha = arg->alpha;
 
     constexpr bool is_int8 = utils::one_of(
-            data_traits<a_type>::data_type, data_type::s8, data_type::u8);
-    constexpr bool is_bf16 = data_traits<a_type>::data_type == data_type::bf16;
+            data_traits_t<a_type>::data_type, data_type::s8, data_type::u8);
+    constexpr bool is_bf16
+            = data_traits_t<a_type>::data_type == data_type::bf16;
     bool is_int8_amx = is_int8 && mayiuse(avx512_core_amx) && __BUILD_GEMM_AMX;
     bool is_bf16_amx = is_bf16 && mayiuse(avx512_core_amx) && __BUILD_GEMM_AMX;
@@ -826,8 +827,9 @@ static dnnl_status_t kernel_driver_parallel_acopiedbcopy(int ithr, dim_t m,
     size_t b_buf_nelems = k * n_padd;
     size_t b_col_sum_nelems = n_padd;
     constexpr bool is_int8 = utils::one_of(
-            data_traits<a_type>::data_type, data_type::s8, data_type::u8);
-    constexpr bool is_bf16 = data_traits<a_type>::data_type == data_type::bf16;
+            data_traits_t<a_type>::data_type, data_type::s8, data_type::u8);
+    constexpr bool is_bf16
+            = data_traits_t<a_type>::data_type == data_type::bf16;
     bool is_int8_amx = is_int8 && mayiuse(avx512_core_amx) && __BUILD_GEMM_AMX;
     bool is_bf16_amx = is_bf16 && mayiuse(avx512_core_amx) && __BUILD_GEMM_AMX;
@@ -1050,7 +1052,7 @@ template <typename a_type, typename b_type, typename c_type>
 static inline bool nocopy_checker(
         int nthr, const gemm_info_t<a_type, b_type, c_type> *arg) {
-    if (data_traits<a_type>::data_type != data_type::f32) return false;
+    if (data_traits_t<a_type>::data_type != data_type::f32) return false;
 
     if (!(mayiuse(avx) && __BUILD_GEMM_AVX2)) return false;
@@ -1089,8 +1091,8 @@ static inline void set_thread_opts_nopack(int nthrs, int nthrs_spawn,
     static constexpr dim_t M2D_MIN = 384;
 
     constexpr bool is_int8 = utils::one_of(
-            data_traits<a_type>::data_type, data_type::s8, data_type::u8);
-    bool isSgemm = data_traits<a_type>::data_type == data_type::f32;
+            data_traits_t<a_type>::data_type, data_type::s8, data_type::u8);
+    bool isSgemm = data_traits_t<a_type>::data_type == data_type::f32;
     dim_t m = arg->m;
     dim_t n = arg->n;
@@ -1247,8 +1249,9 @@ static inline void set_thread_opts_pack(int nthrs,
         bool do_n_blocking = true) {
     constexpr bool is_int8 = utils::one_of(
-            data_traits<a_type>::data_type, data_type::s8, data_type::u8);
-    constexpr bool is_bf16 = data_traits<a_type>::data_type == data_type::bf16;
+            data_traits_t<a_type>::data_type, data_type::s8, data_type::u8);
+    constexpr bool is_bf16
+            = data_traits_t<a_type>::data_type == data_type::bf16;
     bool do_m_blocking_only = do_m_blocking && !do_n_blocking;
@@ -1362,8 +1365,9 @@ static inline int set_thread_opts(int nthrs, int nthrs_spawn,
     thread_info.thread_m = thread_info.thread_n = thread_info.thread_k = -1;
     constexpr bool is_int8 = utils::one_of(
-            data_traits<a_type>::data_type, data_type::s8, data_type::u8);
-    constexpr bool is_bf16 = data_traits<a_type>::data_type == data_type::bf16;
+            data_traits_t<a_type>::data_type, data_type::s8, data_type::u8);
+    constexpr bool is_bf16
+            = data_traits_t<a_type>::data_type == data_type::bf16;
     if (nocopy_checker(nthrs, arg)) {
         thread_info.copy = copy_type::no_copy;
@@ -1452,8 +1456,9 @@ static dnnl_status_t parallel_a_copy(const int ithr, const int nthrs,
     float alpha = arg->alpha;
     constexpr bool is_int8 = utils::one_of(
-            data_traits<a_type>::data_type, data_type::s8, data_type::u8);
-    constexpr bool is_bf16 = data_traits<a_type>::data_type == data_type::bf16;
+            data_traits_t<a_type>::data_type, data_type::s8, data_type::u8);
+    constexpr bool is_bf16
+            = data_traits_t<a_type>::data_type == data_type::bf16;
     bool is_int8_amx = is_int8 && mayiuse(avx512_core_amx) && __BUILD_GEMM_AMX;
     bool is_bf16_amx = is_bf16 && mayiuse(avx512_core_amx) && __BUILD_GEMM_AMX;
     bool is_amx = is_int8_amx || is_bf16_amx;
@@ -1608,7 +1613,7 @@
 static inline void adjust_thread_count(dim_t m, dim_t n, dim_t k, int *nthrs) {
     auto veclen = get_vector_length<c_type>();
     const double fp_per_cycle = 2.0 * 2.0 * veclen;
-    const bool is_f32 = data_traits<a_type>::data_type == data_type::f32;
+    const bool is_f32 = data_traits_t<a_type>::data_type == data_type::f32;
     const bool is_avx512 = mayiuse(avx512_core) && __BUILD_GEMM_AVX512;
     const bool is_avx = mayiuse(avx) && __BUILD_GEMM_AVX2;
@@ -1729,8 +1734,9 @@ static dnnl_status_t gemm_threading_driver(
     auto is_a_packed = (arg->transa == packed);
     auto is_b_packed = (arg->transb == packed);
     constexpr bool is_int8 = utils::one_of(
-            data_traits<a_type>::data_type, data_type::s8, data_type::u8);
-    constexpr bool is_bf16 = data_traits<a_type>::data_type == data_type::bf16;
+            data_traits_t<a_type>::data_type, data_type::s8, data_type::u8);
+    constexpr bool is_bf16
+            = data_traits_t<a_type>::data_type == data_type::bf16;
 
     if ((arg->m <= 0) || (arg->n <= 0)) return dnnl_success;
@@ -1971,7 +1977,7 @@ static dnnl_status_t gemm_threading_driver(
                     // This route is taken only if we realize we need no-copy
                     // after launching the parallel section, due to less
                     // threads being spawned than expected.
-                    assert(data_traits<a_type>::data_type
+                    assert(data_traits_t<a_type>::data_type
                             == data_type::f32);
                     assert(arg->packing == pack_type::none);
@@ -2048,13 +2054,13 @@ dnnl_status_t gemm_driver(const char *transA, const char *transB,
         pack_type packing, gemm_pack_storage_t *pack_dst, bool measure_only) {
     constexpr bool is_int8 = utils::one_of(
-            data_traits<a_type>::data_type, data_type::s8, data_type::u8);
+            data_traits_t<a_type>::data_type, data_type::s8, data_type::u8);
     MAYBE_UNUSED(is_int8);
 
 #if __BUILD_GEMM_AVX512
     // gemm_driver supports bfloat16 gemm for Intel AVX512 and
     // Intel AVX512 BF16.
-    assert(IMPLICATION(data_traits<a_type>::data_type == data_type::bf16,
+    assert(IMPLICATION(data_traits_t<a_type>::data_type == data_type::bf16,
             mayiuse(avx512_core) && !force_nocopy));
 #endif
@@ -2067,8 +2073,8 @@ dnnl_status_t gemm_driver(const char *transA, const char *transB,
 #if __BUILD_GEMM_SSE41
     // gemm_driver supports sgemm for Intel AVX512, Intel AVX2, Intel AVX,
     // and Intel SSE4.1
-    assert(IMPLICATION(
-            data_traits<a_type>::data_type == data_type::f32, mayiuse(sse41)));
+    assert(IMPLICATION(data_traits_t<a_type>::data_type == data_type::f32,
+            mayiuse(sse41)));
 #endif
 
     // 8-bit integer gemm doesn't support nocopy kernels.
diff --git a/src/cpu/x64/gemm/gemm_driver.hpp b/src/cpu/x64/gemm/gemm_driver.hpp
index 650d1775a01..163349b1101 100644
--- a/src/cpu/x64/gemm/gemm_driver.hpp
+++ b/src/cpu/x64/gemm/gemm_driver.hpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2018-2020 Intel Corporation
+* Copyright 2018-2025 Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -36,7 +36,7 @@ dnnl_status_t gemm_driver(const char *transA, const char *transB,
         const b_type *b, const dim_t *ldb, const b_type *ob, const float *beta,
         c_type *c, const dim_t *ldc, const c_type *oc,
         const bool force_jit_nocopy_gemm, pack_type packing = pack_type::none,
-        gemm_pack_storage_t *pack_dst = NULL, bool measure_only = false);
+        gemm_pack_storage_t *pack_dst = nullptr, bool measure_only = false);
 
 void prep_ref_gemm_s8u8s32_pack(
         bool do_a, dim_t rows, dim_t cols, gemm_pack_storage_t *pack_dst);
diff --git a/src/cpu/x64/gemm/gemm_info.cpp b/src/cpu/x64/gemm/gemm_info.cpp
index cd227f30306..05837b0293e 100644
--- a/src/cpu/x64/gemm/gemm_info.cpp
+++ b/src/cpu/x64/gemm/gemm_info.cpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2019-2024 Intel Corporation
+* Copyright 2019-2025 Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -22,6 +22,7 @@
 #include "common/bfloat16.hpp"
 #include "common/dnnl_traits.hpp"
+#include "common/dnnl_sel_build.hpp"
 
 #include "cpu/gemm/gemm.hpp"
@@ -139,7 +140,7 @@ gemm_info_t<a_t, b_t, c_t>::gemm_info_t(const char *transA, const char *transB,
     }
 
     constexpr bool is_int8 = utils::one_of(
-            data_traits<a_t>::data_type, data_type::s8, data_type::u8);
+            data_traits_t<a_t>::data_type, data_type::s8, data_type::u8);
     if (is_int8) this->ao = oa ? *oa : a_t(0);
     prepare_bo(this->bo, ob);
@@ -155,7 +156,7 @@
         this->co = oc;
     }
 
-    bool is_sgemm = data_traits<a_t>::data_type == data_type::f32;
+    bool is_sgemm = data_traits_t<a_t>::data_type == data_type::f32;
     bool is_gemv = this->m == 1 || this->n == 1;
 
     // Copy-based sgemm doesn't support force-nocopy for ISAs older
@@ -213,7 +214,8 @@ void gemm_info_t<a_t, b_t, c_t>::jit_init(void) {
     // TODO: Add dispatching for 1-fma SKUs with support to bf16
     // instructions for AMX kernel.
{ - constexpr bool is_bf16 = data_traits::data_type == data_type::bf16; + constexpr bool is_bf16 + = data_traits_t::data_type == data_type::bf16; const bool max_isa_supports_bf16_ymm = mayiuse(avx512_core_bf16_ymm) && __BUILD_GEMM_AVX512 && !(mayiuse(avx512_core_amx) && __BUILD_GEMM_AMX); @@ -221,7 +223,7 @@ void gemm_info_t::jit_init(void) { use_bf16_ymm = is_bf16 && max_isa_supports_bf16_ymm; } - switch (data_traits::data_type) { + switch (data_traits_t::data_type) { case data_type::s8: if (false) { // dummy if @@ -391,145 +393,158 @@ void gemm_info_t::jit_init(void) { static std::once_flag initialized; static std::atomic st(dnnl_success); std::call_once(initialized, [&, um] { - const bool b_is_s8 = data_traits::data_type == data_type::s8; + const bool b_is_s8 = data_traits_t::data_type == data_type::s8; UNUSED(b_is_s8); constexpr bool is_int8 = utils::one_of( - data_traits::data_type, data_type::s8, data_type::u8); - constexpr bool is_bf16 = data_traits::data_type == data_type::bf16; + data_traits_t::data_type, data_type::s8, data_type::u8); + constexpr bool is_bf16 + = data_traits_t::data_type == data_type::bf16; bool is_int8_amx = is_int8 && mayiuse(avx512_core_amx) && __BUILD_GEMM_AMX; bool is_bf16_amx = is_bf16 && mayiuse(avx512_core_amx) && __BUILD_GEMM_AMX; bool is_amx = is_int8_amx || is_bf16_amx; - static maybe_unique_ptr copy_a[2][2] = {{nullptr}}; - static maybe_unique_ptr copy_b[2][2] = {{nullptr}}; + static maybe_unique_ptr copy_a[2][2] = {{nullptr}}; + static maybe_unique_ptr copy_b[2][2] = {{nullptr}}; - switch (data_traits::data_type) { + switch (data_traits_t::data_type) { case data_type::s8: if (false) { // dummy if #if __BUILD_GEMM_AMX } else if (mayiuse(amx_int8)) { + DNNL_CSCOPE(jit_init_copy_kern_s8_amx_int8) { for (int isTrans : {no_trans, do_trans}) { copy_a[isTrans][no_sum].reset( - new jit_avx512_core_amx_copy_kern( + new jit_avx512_core_amx_copy_kern_t( true, !isTrans, sizeof(a_t))); copy_b[isTrans][no_sum].reset( - new jit_avx512_core_amx_copy_kern( + new jit_avx512_core_amx_copy_kern_t( false, isTrans, sizeof(b_t))); + } } #endif #if __BUILD_GEMM_AVX512 } else if (mayiuse(avx512_core)) { + DNNL_CSCOPE(jit_init_copy_kern_s8_avx512_core) { copy_a[no_trans][no_sum].reset( - new jit_avx512_core_u8_copy_an_kern()); + new jit_avx512_core_u8_copy_an_kern_t()); copy_a[do_trans][no_sum].reset( - new jit_avx512_core_u8_copy_at_kern()); + new jit_avx512_core_u8_copy_at_kern_t()); copy_b[no_trans][no_sum].reset( - new jit_avx512_core_u8_copy_bn_kern(b_is_s8)); + new jit_avx512_core_u8_copy_bn_kern_t(b_is_s8)); copy_b[do_trans][no_sum].reset( - new jit_avx512_core_u8_copy_bt_kern(b_is_s8)); + new jit_avx512_core_u8_copy_bt_kern_t(b_is_s8)); copy_a[no_trans][do_sum].reset( - new jit_avx512_core_u8_copy_sum_an_kern()); + new jit_avx512_core_u8_copy_sum_an_kern_t()); copy_a[do_trans][do_sum].reset( - new jit_avx512_core_u8_copy_sum_at_kern()); + new jit_avx512_core_u8_copy_sum_at_kern_t()); copy_b[no_trans][do_sum].reset( - new jit_avx512_core_u8_copy_sum_bn_kern(b_is_s8)); + new jit_avx512_core_u8_copy_sum_bn_kern_t(b_is_s8)); copy_b[do_trans][do_sum].reset( - new jit_avx512_core_u8_copy_sum_bt_kern(b_is_s8)); + new jit_avx512_core_u8_copy_sum_bt_kern_t(b_is_s8)); + } #endif #if __BUILD_GEMM_AVX2 } else if (mayiuse(avx2_vnni)) { + DNNL_CSCOPE(jit_init_copy_kern_s8_avx2_vnni) { copy_a[no_trans][no_sum].reset( - new jit_avx2_vnni_u8_copy_an_kern()); + new jit_avx2_vnni_u8_copy_an_kern_t()); copy_a[do_trans][no_sum].reset( - new jit_avx2_vnni_u8_copy_at_kern()); + new 
jit_avx2_vnni_u8_copy_at_kern_t()); copy_b[no_trans][no_sum].reset( - new jit_avx2_vnni_u8_copy_bn_kern()); + new jit_avx2_vnni_u8_copy_bn_kern_t()); copy_b[do_trans][no_sum].reset( - new jit_avx2_vnni_u8_copy_bt_kern()); + new jit_avx2_vnni_u8_copy_bt_kern_t()); copy_a[no_trans][do_sum].reset( - new jit_avx2_vnni_u8_copy_sum_an_kern()); + new jit_avx2_vnni_u8_copy_sum_an_kern_t()); copy_a[do_trans][do_sum].reset( - new jit_avx2_vnni_u8_copy_sum_at_kern()); + new jit_avx2_vnni_u8_copy_sum_at_kern_t()); copy_b[no_trans][do_sum].reset( - new jit_avx2_vnni_u8_copy_sum_bn_kern()); + new jit_avx2_vnni_u8_copy_sum_bn_kern_t()); copy_b[do_trans][do_sum].reset( - new jit_avx2_vnni_u8_copy_sum_bt_kern()); + new jit_avx2_vnni_u8_copy_sum_bt_kern_t()); + } #endif #if __BUILD_GEMM_AVX2 } else if (mayiuse(avx2)) { + DNNL_CSCOPE(jit_init_copy_kern_s8_avx2) { copy_a[no_trans][no_sum].reset( - new jit_avx2_u8_copy_an_kern()); + new jit_avx2_u8_copy_an_kern_t()); copy_a[do_trans][no_sum].reset( - new jit_avx2_u8_copy_at_kern()); + new jit_avx2_u8_copy_at_kern_t()); copy_b[no_trans][no_sum].reset( - new jit_avx2_u8_copy_bn_kern()); + new jit_avx2_u8_copy_bn_kern_t()); copy_b[do_trans][no_sum].reset( - new jit_avx2_u8_copy_bt_kern()); + new jit_avx2_u8_copy_bt_kern_t()); copy_a[no_trans][do_sum].reset( - new jit_avx2_u8_copy_sum_an_kern()); + new jit_avx2_u8_copy_sum_an_kern_t()); copy_a[do_trans][do_sum].reset( - new jit_avx2_u8_copy_sum_at_kern()); + new jit_avx2_u8_copy_sum_at_kern_t()); copy_b[no_trans][do_sum].reset( - new jit_avx2_u8_copy_sum_bn_kern()); + new jit_avx2_u8_copy_sum_bn_kern_t()); copy_b[do_trans][do_sum].reset( - new jit_avx2_u8_copy_sum_bt_kern()); + new jit_avx2_u8_copy_sum_bt_kern_t()); + } #endif #if __BUILD_GEMM_AVX2 } else if (mayiuse(avx)) { + DNNL_CSCOPE(jit_init_copy_kern_s8_avx) { copy_a[no_trans][no_sum].reset( - new jit_avx_u8_copy_an_kern()); + new jit_avx_u8_copy_an_kern_t()); copy_a[do_trans][no_sum].reset( - new jit_avx_u8_copy_at_kern()); + new jit_avx_u8_copy_at_kern_t()); copy_b[no_trans][no_sum].reset( - new jit_avx_u8_copy_bn_kern()); + new jit_avx_u8_copy_bn_kern_t()); copy_b[do_trans][no_sum].reset( - new jit_avx_u8_copy_bt_kern()); + new jit_avx_u8_copy_bt_kern_t()); copy_a[no_trans][do_sum].reset( - new jit_avx_u8_copy_sum_an_kern()); + new jit_avx_u8_copy_sum_an_kern_t()); copy_a[do_trans][do_sum].reset( - new jit_avx_u8_copy_sum_at_kern()); + new jit_avx_u8_copy_sum_at_kern_t()); copy_b[no_trans][do_sum].reset( - new jit_avx_u8_copy_sum_bn_kern()); + new jit_avx_u8_copy_sum_bn_kern_t()); copy_b[do_trans][do_sum].reset( - new jit_avx_u8_copy_sum_bt_kern()); + new jit_avx_u8_copy_sum_bt_kern_t()); + } #endif #if __BUILD_GEMM_SSE41 } else if (mayiuse(sse41)) { + DNNL_CSCOPE(jit_init_copy_kern_s8_sse41) { copy_a[no_trans][no_sum].reset( - new jit_sse41_u8_copy_an_kern()); + new jit_sse41_u8_copy_an_kern_t()); copy_a[do_trans][no_sum].reset( - new jit_sse41_u8_copy_at_kern()); + new jit_sse41_u8_copy_at_kern_t()); copy_b[no_trans][no_sum].reset( - new jit_sse41_u8_copy_bn_kern()); + new jit_sse41_u8_copy_bn_kern_t()); copy_b[do_trans][no_sum].reset( - new jit_sse41_u8_copy_bt_kern()); + new jit_sse41_u8_copy_bt_kern_t()); copy_a[no_trans][do_sum].reset( - new jit_sse41_u8_copy_sum_an_kern()); + new jit_sse41_u8_copy_sum_an_kern_t()); copy_a[do_trans][do_sum].reset( - new jit_sse41_u8_copy_sum_at_kern()); + new jit_sse41_u8_copy_sum_at_kern_t()); copy_b[no_trans][do_sum].reset( - new jit_sse41_u8_copy_sum_bn_kern()); + new jit_sse41_u8_copy_sum_bn_kern_t()); 
copy_b[do_trans][do_sum].reset( - new jit_sse41_u8_copy_sum_bt_kern()); + new jit_sse41_u8_copy_sum_bt_kern_t()); + } #endif } break; @@ -539,39 +554,45 @@ void gemm_info_t::jit_init(void) { // dummy if #if __BUILD_GEMM_AMX } else if (mayiuse(amx_bf16)) { + DNNL_CSCOPE(jit_init_copy_kern_bf16_amx_bf16) { for (int isTrans : {no_trans, do_trans}) { copy_a[isTrans][no_sum].reset( - new jit_avx512_core_amx_copy_kern( + new jit_avx512_core_amx_copy_kern_t( true, !isTrans, sizeof(a_t))); copy_b[isTrans][no_sum].reset( - new jit_avx512_core_amx_copy_kern( + new jit_avx512_core_amx_copy_kern_t( false, isTrans, sizeof(b_t))); + } } #endif #if __BUILD_GEMM_AVX512 } else if (mayiuse(avx512_core) && !use_bf16_ymm) { + DNNL_CSCOPE(jit_init_copy_kern_bf16_avx512_core_not_use_bf16_ymm) { copy_a[no_trans][no_sum].reset( - new jit_avx512_core_s16_48x8_copy_an_kern()); + new jit_avx512_core_s16_48x8_copy_an_kern_t()); copy_a[do_trans][no_sum].reset( - new jit_avx512_core_s16_48x8_copy_at_kern()); + new jit_avx512_core_s16_48x8_copy_at_kern_t()); copy_b[no_trans][no_sum].reset( - new jit_avx512_core_s16_48x8_copy_bn_kern()); + new jit_avx512_core_s16_48x8_copy_bn_kern_t()); copy_b[do_trans][no_sum].reset( - new jit_avx512_core_s16_48x8_copy_bt_kern()); + new jit_avx512_core_s16_48x8_copy_bt_kern_t()); + } #endif #if __BUILD_GEMM_AVX512 } else if (mayiuse(avx512_core) && use_bf16_ymm) { + DNNL_CSCOPE(jit_init_copy_kern_bf16_avx512_core_use_bf16_ymm) { copy_a[no_trans][no_sum].reset( - new jit_avx512_core_s16_24x8_copy_an_kern()); + new jit_avx512_core_s16_24x8_copy_an_kern_t()); copy_a[do_trans][no_sum].reset( - new jit_avx512_core_s16_24x8_copy_at_kern()); + new jit_avx512_core_s16_24x8_copy_at_kern_t()); copy_b[no_trans][no_sum].reset( - new jit_avx512_core_s16_24x8_copy_bn_kern()); + new jit_avx512_core_s16_24x8_copy_bn_kern_t()); copy_b[do_trans][no_sum].reset( - new jit_avx512_core_s16_24x8_copy_bt_kern()); + new jit_avx512_core_s16_24x8_copy_bt_kern_t()); + } #endif } break; @@ -581,51 +602,59 @@ void gemm_info_t::jit_init(void) { // dummy if #if __BUILD_GEMM_AVX512 } else if (mayiuse(avx512_core)) { + DNNL_CSCOPE(jit_init_copy_kern_f32_avx512_core) { copy_a[no_trans][no_sum].reset( - new jit_avx512_core_f32_copy_an_kern()); + new jit_avx512_core_f32_copy_an_kern_t()); copy_a[do_trans][no_sum].reset( - new jit_avx512_core_f32_copy_at_kern()); + new jit_avx512_core_f32_copy_at_kern_t()); copy_b[no_trans][no_sum].reset( - new jit_avx512_core_f32_copy_bn_kern()); + new jit_avx512_core_f32_copy_bn_kern_t()); copy_b[do_trans][no_sum].reset( - new jit_avx512_core_f32_copy_bt_kern()); + new jit_avx512_core_f32_copy_bt_kern_t()); + } #endif #if __BUILD_GEMM_AVX2 } else if (mayiuse(avx2)) { + DNNL_CSCOPE(jit_init_copy_kern_f32_avx2) { copy_a[no_trans][no_sum].reset( - new jit_avx2_f32_copy_an_kern()); + new jit_avx2_f32_copy_an_kern_t()); copy_a[do_trans][no_sum].reset( - new jit_avx2_f32_copy_at_kern()); + new jit_avx2_f32_copy_at_kern_t()); copy_b[no_trans][no_sum].reset( - new jit_avx2_f32_copy_bn_kern()); + new jit_avx2_f32_copy_bn_kern_t()); copy_b[do_trans][no_sum].reset( - new jit_avx2_f32_copy_bt_kern()); + new jit_avx2_f32_copy_bt_kern_t()); + } #endif #if __BUILD_GEMM_AVX2 } else if (mayiuse(avx)) { + DNNL_CSCOPE(jit_init_copy_kern_f32_avx) { copy_a[no_trans][no_sum].reset( - new jit_avx_f32_copy_an_kern()); + new jit_avx_f32_copy_an_kern_t()); copy_a[do_trans][no_sum].reset( - new jit_avx_f32_copy_at_kern()); + new jit_avx_f32_copy_at_kern_t()); copy_b[no_trans][no_sum].reset( - new 
jit_avx_f32_copy_bn_kern()); + new jit_avx_f32_copy_bn_kern_t()); copy_b[do_trans][no_sum].reset( - new jit_avx_f32_copy_bt_kern()); + new jit_avx_f32_copy_bt_kern_t()); + } #endif #if __BUILD_GEMM_SSE41 } else if (mayiuse(sse41)) { + DNNL_CSCOPE(jit_init_copy_kern_f32_sse41) { copy_a[no_trans][no_sum].reset( - new jit_sse41_f32_copy_an_kern()); + new jit_sse41_f32_copy_an_kern_t()); copy_a[do_trans][no_sum].reset( - new jit_sse41_f32_copy_at_kern()); + new jit_sse41_f32_copy_at_kern_t()); copy_b[no_trans][no_sum].reset( - new jit_sse41_f32_copy_bn_kern()); + new jit_sse41_f32_copy_bn_kern_t()); copy_b[do_trans][no_sum].reset( - new jit_sse41_f32_copy_bt_kern()); + new jit_sse41_f32_copy_bt_kern_t()); + } #endif } break; @@ -633,87 +662,98 @@ void gemm_info_t::jit_init(void) { default: break; } - constexpr bool is_a_s8 = data_traits::data_type == data_type::s8; - constexpr bool is_b_s8 = data_traits::data_type == data_type::s8; - constexpr bool is_c_s32 = data_traits::data_type == data_type::s32; + constexpr bool is_a_s8 = data_traits_t::data_type == data_type::s8; + constexpr bool is_b_s8 = data_traits_t::data_type == data_type::s8; + constexpr bool is_c_s32 + = data_traits_t::data_type == data_type::s32; UNUSED(is_a_s8); UNUSED(is_b_s8); UNUSED(is_c_s32); - static maybe_unique_ptr kernel[2][2][2][2] + static maybe_unique_ptr kernel[2][2][2][2] = {{{{nullptr}}}}; - switch (data_traits::data_type) { + switch (data_traits_t::data_type) { case data_type::s8: if (false) { // dummy if #if __BUILD_GEMM_AMX } else if (mayiuse(avx512_core_amx)) { + DNNL_CSCOPE(jit_init_gemm_kern_s8_avx512_core_bf16_amx_int8) { for (int isBeta0 : {no_beta0, do_beta0}) { kernel[isBeta0][do_alpha1][no_sum][no_sum].reset( - new jit_avx512_core_amx_gemm_kern( + new jit_avx512_core_amx_gemm_kern_t( is_a_s8, is_b_s8, is_c_s32, isBeta0)); } + } #endif #if __BUILD_GEMM_AVX512 } else if (mayiuse(avx512_core)) { + DNNL_CSCOPE(jit_init_gemm_kern_s8_avx512_core) { for (int isBeta0 : {no_beta0, do_beta0}) for (int doColSum : {no_sum, do_sum}) for (int doRowSum : {no_sum, do_sum}) { kernel[isBeta0][do_alpha1][doColSum][doRowSum].reset( - new jit_avx512_core_gemm_s8u8s32_kern( + new jit_avx512_core_gemm_s8u8s32_kern_t( isBeta0, doColSum, doRowSum)); } + } #endif #if __BUILD_GEMM_AVX2 } else if (mayiuse(avx2)) { + DNNL_CSCOPE(jit_init_gemm_kern_s8_avx2) { for (int isBeta0 : {no_beta0, do_beta0}) for (int doColSum : {no_sum, do_sum}) for (int doRowSum : {no_sum, do_sum}) { kernel[isBeta0][do_alpha1][doColSum][doRowSum] - .reset(new jit_avx2_gemm_s8u8s32_kern( + .reset(new jit_avx2_gemm_s8u8s32_kern_t( isBeta0, doColSum, doRowSum, um)); } + } #endif #if __BUILD_GEMM_AVX2 } else if (mayiuse(avx)) { + DNNL_CSCOPE(jit_init_gemm_kern_s8_avx) { kernel[no_beta0][do_alpha1][no_sum][no_sum].reset( - new jit_avx_kernel_gemm_s8u8s32_kern()); + new jit_avx_kernel_gemm_s8u8s32_kern_t()); kernel[no_beta0][do_alpha1][do_sum][no_sum].reset( - new jit_avx_kernel_c_gemm_s8u8s32_kern()); + new jit_avx_kernel_c_gemm_s8u8s32_kern_t()); kernel[no_beta0][do_alpha1][no_sum][do_sum].reset( - new jit_avx_kernel_r_gemm_s8u8s32_kern()); + new jit_avx_kernel_r_gemm_s8u8s32_kern_t()); kernel[no_beta0][do_alpha1][do_sum][do_sum].reset( - new jit_avx_kernel_b_gemm_s8u8s32_kern()); + new jit_avx_kernel_b_gemm_s8u8s32_kern_t()); kernel[do_beta0][do_alpha1][no_sum][no_sum].reset( - new jit_avx_kernel_b0_gemm_s8u8s32_kern()); + new jit_avx_kernel_b0_gemm_s8u8s32_kern_t()); kernel[do_beta0][do_alpha1][do_sum][no_sum].reset( - new 
jit_avx_kernel_b0_c_gemm_s8u8s32_kern()); + new jit_avx_kernel_b0_c_gemm_s8u8s32_kern_t()); kernel[do_beta0][do_alpha1][no_sum][do_sum].reset( - new jit_avx_kernel_b0_r_gemm_s8u8s32_kern()); + new jit_avx_kernel_b0_r_gemm_s8u8s32_kern_t()); kernel[do_beta0][do_alpha1][do_sum][do_sum].reset( - new jit_avx_kernel_b0_b_gemm_s8u8s32_kern()); + new jit_avx_kernel_b0_b_gemm_s8u8s32_kern_t()); + } #endif #if __BUILD_GEMM_SSE41 } else if (mayiuse(sse41)) { + DNNL_CSCOPE(jit_init_gemm_kern_s8_sse41) { kernel[no_beta0][do_alpha1][no_sum][no_sum].reset( - new jit_sse41_kernel_gemm_s8u8s32_kern()); + new jit_sse41_kernel_gemm_s8u8s32_kern_t()); kernel[no_beta0][do_alpha1][do_sum][no_sum].reset( - new jit_sse41_kernel_c_gemm_s8u8s32_kern()); + new jit_sse41_kernel_c_gemm_s8u8s32_kern_t()); kernel[no_beta0][do_alpha1][no_sum][do_sum].reset( - new jit_sse41_kernel_r_gemm_s8u8s32_kern()); + new jit_sse41_kernel_r_gemm_s8u8s32_kern_t()); kernel[no_beta0][do_alpha1][do_sum][do_sum].reset( - new jit_sse41_kernel_b_gemm_s8u8s32_kern()); + new jit_sse41_kernel_b_gemm_s8u8s32_kern_t()); kernel[do_beta0][do_alpha1][no_sum][no_sum].reset( - new jit_sse41_kernel_b0_gemm_s8u8s32_kern()); + new jit_sse41_kernel_b0_gemm_s8u8s32_kern_t()); kernel[do_beta0][do_alpha1][do_sum][no_sum].reset( - new jit_sse41_kernel_b0_c_gemm_s8u8s32_kern()); + new jit_sse41_kernel_b0_c_gemm_s8u8s32_kern_t()); kernel[do_beta0][do_alpha1][no_sum][do_sum].reset( - new jit_sse41_kernel_b0_r_gemm_s8u8s32_kern()); + new jit_sse41_kernel_b0_r_gemm_s8u8s32_kern_t()); kernel[do_beta0][do_alpha1][do_sum][do_sum].reset( - new jit_sse41_kernel_b0_b_gemm_s8u8s32_kern()); + new jit_sse41_kernel_b0_b_gemm_s8u8s32_kern_t()); + } #endif } break; @@ -723,20 +763,24 @@ void gemm_info_t::jit_init(void) { // dummy if #if __BUILD_GEMM_AMX } else if (mayiuse(avx512_core_amx)) { + DNNL_CSCOPE(jit_init_gemm_kern_bf16_avx512_core_bf16_amx_bf16) { for (int isBeta0 : {no_beta0, do_beta0}) { kernel[isBeta0][do_alpha1][no_sum][no_sum].reset( - new jit_avx512_core_amx_gemm_kern( + new jit_avx512_core_amx_gemm_kern_t( is_a_s8, is_b_s8, is_c_s32, isBeta0)); } + } #endif #if __BUILD_GEMM_AVX512 } else if (mayiuse(avx512_core)) { + DNNL_CSCOPE(jit_init_gemm_kern_bf16_avx512_core) { for (int isBeta0 : {no_beta0, do_beta0}) for (int isAlpha1 : {no_alpha1, do_alpha1}) { kernel[isBeta0][isAlpha1][no_sum][no_sum].reset( - new jit_avx512_core_gemm_bf16bf16f32_kern( + new jit_avx512_core_gemm_bf16bf16f32_kern_t( isBeta0, isAlpha1, !use_bf16_ymm)); } + } #endif } break; @@ -746,24 +790,30 @@ void gemm_info_t::jit_init(void) { // dummy if #if __BUILD_GEMM_AVX2 } else if (mayiuse(avx2)) { + DNNL_CSCOPE(jit_init_gemm_kern_f32_avx2) { for (int isBeta0 : {no_beta0, do_beta0}) { kernel[isBeta0][do_alpha1][no_sum][no_sum].reset( - new jit_avx2_kernel_sgemm_kern(isBeta0)); + new jit_avx2_kernel_sgemm_kern_t(isBeta0)); + } } #endif #if __BUILD_GEMM_AVX2 } else if (mayiuse(avx)) { + DNNL_CSCOPE(jit_init_gemm_kern_f32_avx) { kernel[no_beta0][do_alpha1][no_sum][no_sum].reset( - new jit_avx_kernel_sgemm_kern()); + new jit_avx_kernel_sgemm_kern_t()); kernel[do_beta0][do_alpha1][no_sum][no_sum].reset( - new jit_avx_kernel_b0_sgemm_kern()); + new jit_avx_kernel_b0_sgemm_kern_t()); + } #endif #if __BUILD_GEMM_SSE41 } else if (mayiuse(sse41)) { + DNNL_CSCOPE(jit_init_gemm_kern_f32_sse41) { kernel[no_beta0][do_alpha1][no_sum][no_sum].reset( - new jit_sse41_kernel_sgemm_kern()); + new jit_sse41_kernel_sgemm_kern_t()); kernel[do_beta0][do_alpha1][no_sum][no_sum].reset( - new 
jit_sse41_kernel_b0_sgemm_kern()); + new jit_sse41_kernel_b0_sgemm_kern_t()); + } #endif } break; @@ -771,22 +821,27 @@ void gemm_info_t::jit_init(void) { default: break; } - static maybe_unique_ptr gemv_kernel[2] = {nullptr}; - static maybe_unique_ptr gemv_s8s8s32_kernel = nullptr; - static maybe_unique_ptr gemv_s8u8s32_kernel = nullptr; - static maybe_unique_ptr gemv_u8s8s32_kernel = nullptr; - switch (data_traits::data_type) { + static maybe_unique_ptr gemv_kernel[2] = {nullptr}; + static maybe_unique_ptr gemv_s8s8s32_kernel = nullptr; + static maybe_unique_ptr gemv_s8u8s32_kernel = nullptr; + static maybe_unique_ptr gemv_u8s8s32_kernel = nullptr; + switch (data_traits_t::data_type) { case data_type::s8: if (false) { // dummy if #if __BUILD_GEMM_AVX512 } else if (mayiuse(avx512_core)) { + DNNL_CSCOPE(jit_init_gemv_kern_s8_avx512_core) { gemv_s8s8s32_kernel.reset( - new jit_avx512_core_gemv_s8x8s32_kern(ver_t::s8s8)); + new jit_avx512_core_gemv_s8x8s32_kern_t( + ver_t::s8s8)); gemv_s8u8s32_kernel.reset( - new jit_avx512_core_gemv_s8x8s32_kern(ver_t::s8u8)); + new jit_avx512_core_gemv_s8x8s32_kern_t( + ver_t::s8u8)); gemv_u8s8s32_kernel.reset( - new jit_avx512_core_gemv_s8x8s32_kern(ver_t::u8s8)); + new jit_avx512_core_gemv_s8x8s32_kern_t( + ver_t::u8s8)); + } #endif } break; @@ -796,10 +851,12 @@ void gemm_info_t::jit_init(void) { // dummy if #if __BUILD_GEMM_AVX512 } else if (mayiuse(avx512_core)) { + DNNL_CSCOPE(jit_init_gemv_kern_bf16_avx512_core) { for (int isTrans : {no_trans, do_trans}) gemv_kernel[isTrans].reset( - new jit_avx512_core_gemv_bf16bf16f32_kern( + new jit_avx512_core_gemv_bf16bf16f32_kern_t( isTrans)); + } #endif } break; @@ -809,16 +866,21 @@ void gemm_info_t::jit_init(void) { // dummy if #if __BUILD_GEMM_AVX2 } else if (mayiuse(avx)) { + DNNL_CSCOPE(jit_init_gemv_kern_f32_avx) { gemv_kernel[no_trans].reset( - new jit_sse41_gemv_n_f32_kern()); - gemv_kernel[do_trans].reset(new jit_avx_gemv_t_f32_kern()); + new jit_sse41_gemv_n_f32_kern_t()); + gemv_kernel[do_trans].reset( + new jit_avx_gemv_t_f32_kern_t()); + } #endif #if __BUILD_GEMM_SSE41 } else if (mayiuse(sse41)) { + DNNL_CSCOPE(jit_init_gemv_kern_f32_sse41) { gemv_kernel[no_trans].reset( - new jit_sse41_gemv_n_f32_kern()); + new jit_sse41_gemv_n_f32_kern_t()); gemv_kernel[do_trans].reset( - new jit_sse41_gemv_t_f32_kern()); + new jit_sse41_gemv_t_f32_kern_t()); + } #endif } break; @@ -882,7 +944,7 @@ void gemm_info_t::jit_init(void) { } // Set gemv floating point kernels - if (utils::one_of(data_traits::data_type, data_type::f32, + if (utils::one_of(data_traits_t::data_type, data_type::f32, data_type::bf16)) { for (int isTrans : {no_trans, do_trans}) { auto *p_gemv_kernel = gemv_kernel[isTrans].get(); @@ -895,7 +957,7 @@ void gemm_info_t::jit_init(void) { } // Set gemv integer gemm kernels - if (data_traits::data_type == data_type::s8) { + if (data_traits_t::data_type == data_type::s8) { if (gemv_s8s8s32_kernel != nullptr) { auto *kern = gemv_s8s8s32_kernel.get(); st = kern->create_kernel(); @@ -927,7 +989,7 @@ void gemm_info_t::jit_init(void) { int copy_trans_a = (this->transa == do_trans) ? do_trans : no_trans; int copy_trans_b = (this->transb == do_trans) ? do_trans : no_trans; - constexpr bool is_bf16 = data_traits::data_type == data_type::bf16; + constexpr bool is_bf16 = data_traits_t::data_type == data_type::bf16; bool doAlpha1 = this->alpha != 1.0f && is_bf16 ? 
no_alpha1 : do_alpha1;
 
     {
@@ -950,7 +1012,7 @@ void gemm_info_t<a_t, b_t, c_t>::jit_init(void) {
     this->gemv_s8s8s32_kernel = nullptr;
     this->gemv_s8u8s32_kernel = nullptr;
     this->gemv_u8s8s32_kernel = nullptr;
-    if (data_traits<a_t>::data_type == data_type::s8) {
+    if (data_traits_t<a_t>::data_type == data_type::s8) {
         this->gemv_s8s8s32_kernel = gemv_s8s8s32_kern;
         this->gemv_s8u8s32_kernel = gemv_s8u8s32_kern;
         this->gemv_u8s8s32_kernel = gemv_u8s8s32_kern;
@@ -965,7 +1027,7 @@ template <typename a_t, typename b_t, typename c_t>
 bool gemm_info_t<a_t, b_t, c_t>::hasKernels(void) {
-    switch (data_traits<a_t>::data_type) {
+    switch (data_traits_t<a_t>::data_type) {
         case data_type::s8:
             if (mayiuse(sse41)) {
                 for (int isBeta0 : {no_beta0, do_beta0})
diff --git a/src/cpu/x64/gemm/gemm_pack.cpp b/src/cpu/x64/gemm/gemm_pack.cpp
index 091a4a69c60..1b98e6e26c5 100644
--- a/src/cpu/x64/gemm/gemm_pack.cpp
+++ b/src/cpu/x64/gemm/gemm_pack.cpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2019-2024 Intel Corporation
+* Copyright 2019-2025 Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -79,8 +79,8 @@ static inline CBLAS_OFFSET cblas_offset(const char *offset) {
 template <typename a_dt, typename b_dt>
 static inline bool use_reference_igemm(void) {
     constexpr bool is_s8u8 = true
-            && data_traits<a_dt>::data_type == data_type::s8
-            && data_traits<b_dt>::data_type == data_type::u8;
+            && data_traits_t<a_dt>::data_type == data_type::s8
+            && data_traits_t<b_dt>::data_type == data_type::u8;
 
     if (is_s8u8)
         return !mayiuse(sse41);
     else
@@ -241,8 +241,8 @@ dnnl_status_t gemm_x8x8s32_pack_get_size(const char *identifier,
 #if USE_MKL_PACKED_GEMM
     constexpr bool is_s8u8 = true
-            && data_traits<a_dt>::data_type == data_type::s8
-            && data_traits<b_dt>::data_type == data_type::u8;
+            && data_traits_t<a_dt>::data_type == data_type::s8
+            && data_traits_t<b_dt>::data_type == data_type::u8;
 
     if (is_s8u8) {
         *size = cblas_gemm_s8u8s32_pack_get_size(
@@ -356,8 +356,8 @@ dnnl_status_t gemm_x8x8s32_pack(const char *identifier, const char *transa,
 #if USE_MKL_PACKED_GEMM
     constexpr bool is_s8u8 = true
-            && data_traits<a_dt>::data_type == data_type::s8
-            && data_traits<b_dt>::data_type == data_type::u8;
+            && data_traits_t<a_dt>::data_type == data_type::s8
+            && data_traits_t<b_dt>::data_type == data_type::u8;
 
     if (is_s8u8) {
         auto cblas_id = cblas_identifier(identifier);
@@ -459,8 +459,8 @@ dnnl_status_t gemm_x8x8s32_compute(const char *transa, const char *transb,
 #if USE_MKL_PACKED_GEMM
     constexpr bool is_s8u8 = true
-            && data_traits<a_dt>::data_type == data_type::s8
-            && data_traits<b_dt>::data_type == data_type::u8;
+            && data_traits_t<a_dt>::data_type == data_type::s8
+            && data_traits_t<b_dt>::data_type == data_type::u8;
 
     if (is_s8u8) {
         if (utils::any_null(transa, transb, offsetc, M, N, K, alpha, A, lda, ao,
diff --git a/src/cpu/x64/gemm/gemm_pack_storage.hpp b/src/cpu/x64/gemm/gemm_pack_storage.hpp
index 2f92e445c0a..73111f73c7d 100644
--- a/src/cpu/x64/gemm/gemm_pack_storage.hpp
+++ b/src/cpu/x64/gemm/gemm_pack_storage.hpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2019-2023 Intel Corporation
+* Copyright 2019-2025 Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -108,14 +108,14 @@ struct gemm_pack_storage_t {
     template <typename data_type>
     data_type *row_sums(int ithr, dim_t r0, dim_t cblock) const {
-        if (!has_row_sums()) return NULL;
+        if (!has_row_sums()) return nullptr;
         auto id = thread_to_slice(ithr);
         return get_block<data_type>(sums_header->slice[id], r0, cblock);
     }
 
     template <typename data_type>
     data_type *col_sums(int ithr, dim_t rblock, dim_t c0) const {
-        if (!has_col_sums()) return NULL;
+        if (!has_col_sums()) return nullptr;
         auto id = thread_to_slice(ithr);
         return get_block<data_type>(sums_header->slice[id], rblock, c0);
     }
diff --git a/src/cpu/x64/gemm/gemm_threading.hpp b/src/cpu/x64/gemm/gemm_threading.hpp
index 3915dd54f12..b0af2760095 100644
--- a/src/cpu/x64/gemm/gemm_threading.hpp
+++ b/src/cpu/x64/gemm/gemm_threading.hpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2019-2022 Intel Corporation
+* Copyright 2019-2025 Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -43,7 +43,7 @@ struct gemm_slice_t {
 };
 
 struct gemm_threading_t {
-    gemm_threading_t() {};
+    gemm_threading_t() = default;
 
     int nthrs_m, nthrs_n, nthrs_k;
     dim_t block_m, block_n, block_k; // Blocking sizes (-1 = default)
diff --git a/src/cpu/x64/gemm/gemm_utils.hpp b/src/cpu/x64/gemm/gemm_utils.hpp
index 76462b5b150..739ed73d812 100644
--- a/src/cpu/x64/gemm/gemm_utils.hpp
+++ b/src/cpu/x64/gemm/gemm_utils.hpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2020-2023 Intel Corporation
+* Copyright 2020-2025 Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -171,7 +171,7 @@ dnnl_status_t pack_no_copy(const T *src, dim_t ld_src, dim_t nrows, dim_t ncols,
     dim_t nrows_dst, ncols_dst;
     dim_t ld_dst, td_dst;
 
-    constexpr bool is_f32 = data_traits<T>::data_type == data_type::f32;
+    constexpr bool is_f32 = data_traits_t<T>::data_type == data_type::f32;
 
     if (!dst_pack->get_nocopy(0, trans_dst, ld_dst, td_dst))
         return dnnl_invalid_arguments;
diff --git a/src/cpu/x64/gemm/gemv_driver.cpp b/src/cpu/x64/gemm/gemv_driver.cpp
index 7b6ab72945f..241b84ca802 100644
--- a/src/cpu/x64/gemm/gemv_driver.cpp
+++ b/src/cpu/x64/gemm/gemv_driver.cpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2019-2023 Intel Corporation
+* Copyright 2019-2025 Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -231,7 +231,7 @@ template <typename a_t, typename b_t, typename c_t>
 static inline int thread_checker(
         int nthr, const dim_t m, const dim_t n, int trans) {
     constexpr bool is_f32
-            = utils::one_of(data_traits<a_t>::data_type, data_type::f32);
+            = utils::one_of(data_traits_t<a_t>::data_type, data_type::f32);
 
     if (is_f32) {
         // Threshold based on performance measurement with warm and cold cache
@@ -317,7 +317,7 @@ template <typename T>
 static inline void part_1d(const dim_t m, const int ithr, const int nthr,
         T *addr, dim_t &off, dim_t &size) {
     constexpr bool is_f32
-            = utils::one_of(data_traits<T>::data_type, data_type::f32);
+            = utils::one_of(data_traits_t<T>::data_type, data_type::f32);
 
     if (ithr >= nthr) {
         size = 0;
@@ -397,9 +397,9 @@ static inline void gemv_threading_driver(const int trans, const dim_t m,
         const b_t *x, const dim_t incx, const float beta, c_t *y,
         const dim_t incy, const gemm_info_t<a_t, b_t, c_t> *arg) {
     constexpr bool is_f32
-            = utils::one_of(data_traits<a_t>::data_type, data_type::f32);
+            = utils::one_of(data_traits_t<a_t>::data_type, data_type::f32);
     constexpr bool is_bf16
-            = utils::one_of(data_traits<a_t>::data_type, data_type::bf16);
+            = utils::one_of(data_traits_t<a_t>::data_type, data_type::bf16);
 
     // Quick return if possible.
     if (m <= 0 || n <= 0) return;
diff --git a/src/cpu/x64/gemm/s8x8s32/common_u8.hpp b/src/cpu/x64/gemm/s8x8s32/common_u8.hpp
index 386821cfb4e..575f73a52a9 100644
--- a/src/cpu/x64/gemm/s8x8s32/common_u8.hpp
+++ b/src/cpu/x64/gemm/s8x8s32/common_u8.hpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2019-2024 Intel Corporation
+* Copyright 2019-2025 Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -22,464 +22,464 @@
 #include "cpu/x64/jit_generator.hpp"
 
 #define PADD_BYTESIZE_ONPAGE(x, size) \
-    (((x) * (size) + PAGE_4K - 1) / PAGE_4K) * PAGE_4K
-#define NEXT_THR_STRIDE(x, size) (PADD_BYTESIZE_ONPAGE(x, size)) / size
+    ((((x) * (size) + PAGE_4K - 1) / PAGE_4K) * PAGE_4K)
+#define NEXT_THR_STRIDE(x, size) (PADD_BYTESIZE_ONPAGE(x, (size)) / (size))
 
 namespace dnnl {
 namespace impl {
 namespace cpu {
 namespace x64 {
 
-class jit_avx512_core_u8_copy_an_kern : public jit_generator {
-    DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx512_core_u8_copy_an_kern);
-    void generate() override ATTRIBUTE_OPTIMIZE;
+class jit_avx512_core_u8_copy_an_kern_t : public jit_generator_t {
+    DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx512_core_u8_copy_an_kern_t);
+    void generate() override;
 
 public:
-    jit_avx512_core_u8_copy_an_kern();
+    jit_avx512_core_u8_copy_an_kern_t();
 };
 
-class jit_avx512_core_u8_copy_at_kern : public jit_generator {
-    DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx512_core_u8_copy_at_kern);
-    void generate() override ATTRIBUTE_OPTIMIZE;
+class jit_avx512_core_u8_copy_at_kern_t : public jit_generator_t {
+    DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx512_core_u8_copy_at_kern_t);
+    void generate() override;
 
 public:
-    jit_avx512_core_u8_copy_at_kern();
+    jit_avx512_core_u8_copy_at_kern_t();
 };
 
-class jit_avx512_core_u8_copy_bn_kern : public jit_generator {
-    DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx512_core_u8_copy_bn_kern);
-    void generate() override ATTRIBUTE_OPTIMIZE;
+class jit_avx512_core_u8_copy_bn_kern_t : public jit_generator_t {
+    DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx512_core_u8_copy_bn_kern_t);
+    void generate() override;
 
     bool s8_case;
 
 public:
-    jit_avx512_core_u8_copy_bn_kern(bool s8 = false);
+    jit_avx512_core_u8_copy_bn_kern_t(bool s8 = false);
 };
 
-class jit_avx512_core_u8_copy_bt_kern : public jit_generator {
-
DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx512_core_u8_copy_bt_kern); - void generate() override ATTRIBUTE_OPTIMIZE; +class jit_avx512_core_u8_copy_bt_kern_t : public jit_generator_t { + DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx512_core_u8_copy_bt_kern_t); + void generate() override; bool s8_case; public: - jit_avx512_core_u8_copy_bt_kern(bool s8 = false); + jit_avx512_core_u8_copy_bt_kern_t(bool s8 = false); }; -class jit_avx512_core_u8_copy_sum_an_kern : public jit_generator { - DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx512_core_u8_copy_sum_an_kern); - void generate() override ATTRIBUTE_OPTIMIZE; +class jit_avx512_core_u8_copy_sum_an_kern_t : public jit_generator_t { + DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx512_core_u8_copy_sum_an_kern_t); + void generate() override; public: - jit_avx512_core_u8_copy_sum_an_kern(); + jit_avx512_core_u8_copy_sum_an_kern_t(); }; -class jit_avx512_core_u8_copy_sum_at_kern : public jit_generator { - DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx512_core_u8_copy_sum_at_kern); - void generate() override ATTRIBUTE_OPTIMIZE; +class jit_avx512_core_u8_copy_sum_at_kern_t : public jit_generator_t { + DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx512_core_u8_copy_sum_at_kern_t); + void generate() override; public: - jit_avx512_core_u8_copy_sum_at_kern(); + jit_avx512_core_u8_copy_sum_at_kern_t(); }; -class jit_avx512_core_u8_copy_sum_bn_kern : public jit_generator { - DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx512_core_u8_copy_sum_bn_kern); - void generate() override ATTRIBUTE_OPTIMIZE; +class jit_avx512_core_u8_copy_sum_bn_kern_t : public jit_generator_t { + DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx512_core_u8_copy_sum_bn_kern_t); + void generate() override; bool s8_case; public: - jit_avx512_core_u8_copy_sum_bn_kern(bool s8 = false); + jit_avx512_core_u8_copy_sum_bn_kern_t(bool s8 = false); }; -class jit_avx512_core_u8_copy_sum_bt_kern : public jit_generator { - DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx512_core_u8_copy_sum_bt_kern); - void generate() override ATTRIBUTE_OPTIMIZE; +class jit_avx512_core_u8_copy_sum_bt_kern_t : public jit_generator_t { + DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx512_core_u8_copy_sum_bt_kern_t); + void generate() override; bool s8_case; public: - jit_avx512_core_u8_copy_sum_bt_kern(bool s8 = false); + jit_avx512_core_u8_copy_sum_bt_kern_t(bool s8 = false); }; -class jit_avx2_vnni_u8_copy_an_kern : public jit_generator { - DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx2_vnni_u8_copy_an_kern); - void generate() override ATTRIBUTE_OPTIMIZE; +class jit_avx2_vnni_u8_copy_an_kern_t : public jit_generator_t { + DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx2_vnni_u8_copy_an_kern_t); + void generate() override; public: - jit_avx2_vnni_u8_copy_an_kern(); + jit_avx2_vnni_u8_copy_an_kern_t(); }; -class jit_avx2_vnni_u8_copy_at_kern : public jit_generator { - DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx2_vnni_u8_copy_at_kern); - void generate() override ATTRIBUTE_OPTIMIZE; +class jit_avx2_vnni_u8_copy_at_kern_t : public jit_generator_t { + DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx2_vnni_u8_copy_at_kern_t); + void generate() override; public: - jit_avx2_vnni_u8_copy_at_kern(); + jit_avx2_vnni_u8_copy_at_kern_t(); }; -class jit_avx2_vnni_u8_copy_bn_kern : public jit_generator { - DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx2_vnni_u8_copy_bn_kern); - void generate() override ATTRIBUTE_OPTIMIZE; +class jit_avx2_vnni_u8_copy_bn_kern_t : public jit_generator_t { + DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx2_vnni_u8_copy_bn_kern_t); + void generate() override; public: - jit_avx2_vnni_u8_copy_bn_kern(); + jit_avx2_vnni_u8_copy_bn_kern_t(); }; 
-class jit_avx2_vnni_u8_copy_bt_kern : public jit_generator { - DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx2_vnni_u8_copy_bt_kern); - void generate() override ATTRIBUTE_OPTIMIZE; +class jit_avx2_vnni_u8_copy_bt_kern_t : public jit_generator_t { + DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx2_vnni_u8_copy_bt_kern_t); + void generate() override; public: - jit_avx2_vnni_u8_copy_bt_kern(); + jit_avx2_vnni_u8_copy_bt_kern_t(); }; -class jit_avx2_vnni_u8_copy_sum_an_kern : public jit_generator { - DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx2_vnni_u8_copy_sum_an_kern); - void generate() override ATTRIBUTE_OPTIMIZE; +class jit_avx2_vnni_u8_copy_sum_an_kern_t : public jit_generator_t { + DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx2_vnni_u8_copy_sum_an_kern_t); + void generate() override; public: - jit_avx2_vnni_u8_copy_sum_an_kern(); + jit_avx2_vnni_u8_copy_sum_an_kern_t(); }; -class jit_avx2_vnni_u8_copy_sum_at_kern : public jit_generator { - DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx2_vnni_u8_copy_sum_at_kern); - void generate() override ATTRIBUTE_OPTIMIZE; +class jit_avx2_vnni_u8_copy_sum_at_kern_t : public jit_generator_t { + DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx2_vnni_u8_copy_sum_at_kern_t); + void generate() override; public: - jit_avx2_vnni_u8_copy_sum_at_kern(); + jit_avx2_vnni_u8_copy_sum_at_kern_t(); }; -class jit_avx2_vnni_u8_copy_sum_bn_kern : public jit_generator { - DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx2_vnni_u8_copy_sum_bn_kern); - void generate() override ATTRIBUTE_OPTIMIZE; +class jit_avx2_vnni_u8_copy_sum_bn_kern_t : public jit_generator_t { + DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx2_vnni_u8_copy_sum_bn_kern_t); + void generate() override; public: - jit_avx2_vnni_u8_copy_sum_bn_kern(); + jit_avx2_vnni_u8_copy_sum_bn_kern_t(); }; -class jit_avx2_vnni_u8_copy_sum_bt_kern : public jit_generator { - DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx2_vnni_u8_copy_sum_bt_kern); - void generate() override ATTRIBUTE_OPTIMIZE; +class jit_avx2_vnni_u8_copy_sum_bt_kern_t : public jit_generator_t { + DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx2_vnni_u8_copy_sum_bt_kern_t); + void generate() override; public: - jit_avx2_vnni_u8_copy_sum_bt_kern(); + jit_avx2_vnni_u8_copy_sum_bt_kern_t(); }; -class jit_avx2_u8_copy_an_kern : public jit_generator { - DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx2_u8_copy_an_kern); - void generate() override ATTRIBUTE_OPTIMIZE; +class jit_avx2_u8_copy_an_kern_t : public jit_generator_t { + DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx2_u8_copy_an_kern_t); + void generate() override; public: - jit_avx2_u8_copy_an_kern(); + jit_avx2_u8_copy_an_kern_t(); }; -class jit_avx2_u8_copy_at_kern : public jit_generator { - DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx2_u8_copy_at_kern); - void generate() override ATTRIBUTE_OPTIMIZE; +class jit_avx2_u8_copy_at_kern_t : public jit_generator_t { + DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx2_u8_copy_at_kern_t); + void generate() override; public: - jit_avx2_u8_copy_at_kern(); + jit_avx2_u8_copy_at_kern_t(); }; -class jit_avx2_u8_copy_bn_kern : public jit_generator { - DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx2_u8_copy_bn_kern); - void generate() override ATTRIBUTE_OPTIMIZE; +class jit_avx2_u8_copy_bn_kern_t : public jit_generator_t { + DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx2_u8_copy_bn_kern_t); + void generate() override; public: - jit_avx2_u8_copy_bn_kern(); + jit_avx2_u8_copy_bn_kern_t(); }; -class jit_avx2_u8_copy_bt_kern : public jit_generator { - DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx2_u8_copy_bt_kern); - void generate() override ATTRIBUTE_OPTIMIZE; +class jit_avx2_u8_copy_bt_kern_t : public 
jit_generator_t {
+    DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx2_u8_copy_bt_kern_t);
+    void generate() override;
 
 public:
-    jit_avx2_u8_copy_bt_kern();
+    jit_avx2_u8_copy_bt_kern_t();
 };
 
-class jit_avx2_u8_copy_sum_an_kern : public jit_generator {
-    DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx2_u8_copy_sum_an_kern);
-    void generate() override ATTRIBUTE_OPTIMIZE;
+class jit_avx2_u8_copy_sum_an_kern_t : public jit_generator_t {
+    DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx2_u8_copy_sum_an_kern_t);
+    void generate() override;
 
 public:
-    jit_avx2_u8_copy_sum_an_kern();
+    jit_avx2_u8_copy_sum_an_kern_t();
 };
 
-class jit_avx2_u8_copy_sum_at_kern : public jit_generator {
-    DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx2_u8_copy_sum_at_kern);
-    void generate() override ATTRIBUTE_OPTIMIZE;
+class jit_avx2_u8_copy_sum_at_kern_t : public jit_generator_t {
+    DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx2_u8_copy_sum_at_kern_t);
+    void generate() override;
 
 public:
-    jit_avx2_u8_copy_sum_at_kern();
+    jit_avx2_u8_copy_sum_at_kern_t();
 };
 
-class jit_avx2_u8_copy_sum_bn_kern : public jit_generator {
-    DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx2_u8_copy_sum_bn_kern);
-    void generate() override ATTRIBUTE_OPTIMIZE;
+class jit_avx2_u8_copy_sum_bn_kern_t : public jit_generator_t {
+    DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx2_u8_copy_sum_bn_kern_t);
+    void generate() override;
 
 public:
-    jit_avx2_u8_copy_sum_bn_kern();
+    jit_avx2_u8_copy_sum_bn_kern_t();
 };
 
-class jit_avx2_u8_copy_sum_bt_kern : public jit_generator {
-    DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx2_u8_copy_sum_bt_kern);
-    void generate() override ATTRIBUTE_OPTIMIZE;
+class jit_avx2_u8_copy_sum_bt_kern_t : public jit_generator_t {
+    DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx2_u8_copy_sum_bt_kern_t);
+    void generate() override;
 
 public:
-    jit_avx2_u8_copy_sum_bt_kern();
+    jit_avx2_u8_copy_sum_bt_kern_t();
 };
 
-class jit_avx_u8_copy_an_kern : public jit_generator {
-    DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx_u8_copy_an_kern);
-    void generate() override ATTRIBUTE_OPTIMIZE;
+class jit_avx_u8_copy_an_kern_t : public jit_generator_t {
+    DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx_u8_copy_an_kern_t);
+    void generate() override;
 
 public:
-    jit_avx_u8_copy_an_kern();
+    jit_avx_u8_copy_an_kern_t();
 };
 
-class jit_avx_u8_copy_at_kern : public jit_generator {
-    DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx_u8_copy_at_kern);
-    void generate() override ATTRIBUTE_OPTIMIZE;
+class jit_avx_u8_copy_at_kern_t : public jit_generator_t {
+    DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx_u8_copy_at_kern_t);
+    void generate() override;
 
 public:
-    jit_avx_u8_copy_at_kern();
+    jit_avx_u8_copy_at_kern_t();
 };
 
-class jit_avx_u8_copy_bn_kern : public jit_generator {
-    DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx_u8_copy_bn_kern);
-    void generate() override ATTRIBUTE_OPTIMIZE;
+class jit_avx_u8_copy_bn_kern_t : public jit_generator_t {
+    DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx_u8_copy_bn_kern_t);
+    void generate() override;
 
 public:
-    jit_avx_u8_copy_bn_kern();
+    jit_avx_u8_copy_bn_kern_t();
 };
 
-class jit_avx_u8_copy_bt_kern : public jit_generator {
-    DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx_u8_copy_bt_kern);
-    void generate() override ATTRIBUTE_OPTIMIZE;
+class jit_avx_u8_copy_bt_kern_t : public jit_generator_t {
+    DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx_u8_copy_bt_kern_t);
+    void generate() override;
 
 public:
-    jit_avx_u8_copy_bt_kern();
+    jit_avx_u8_copy_bt_kern_t();
 };
 
-class jit_avx_u8_copy_sum_an_kern : public jit_generator {
-    DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx_u8_copy_sum_an_kern);
-    void generate() override ATTRIBUTE_OPTIMIZE;
+class jit_avx_u8_copy_sum_an_kern_t : public jit_generator_t {
+    DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx_u8_copy_sum_an_kern_t);
+    void generate() override;
 
 public:
-    jit_avx_u8_copy_sum_an_kern();
+    jit_avx_u8_copy_sum_an_kern_t();
 };
 
-class jit_avx_u8_copy_sum_at_kern : public jit_generator {
-    DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx_u8_copy_sum_at_kern);
-    void generate() override ATTRIBUTE_OPTIMIZE;
+class jit_avx_u8_copy_sum_at_kern_t : public jit_generator_t {
+    DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx_u8_copy_sum_at_kern_t);
+    void generate() override;
 
 public:
-    jit_avx_u8_copy_sum_at_kern();
+    jit_avx_u8_copy_sum_at_kern_t();
 };
 
-class jit_avx_u8_copy_sum_bn_kern : public jit_generator {
-    DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx_u8_copy_sum_bn_kern);
-    void generate() override ATTRIBUTE_OPTIMIZE;
+class jit_avx_u8_copy_sum_bn_kern_t : public jit_generator_t {
+    DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx_u8_copy_sum_bn_kern_t);
+    void generate() override;
 
 public:
-    jit_avx_u8_copy_sum_bn_kern();
+    jit_avx_u8_copy_sum_bn_kern_t();
 };
 
-class jit_avx_u8_copy_sum_bt_kern : public jit_generator {
-    DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx_u8_copy_sum_bt_kern);
-    void generate() override ATTRIBUTE_OPTIMIZE;
+class jit_avx_u8_copy_sum_bt_kern_t : public jit_generator_t {
+    DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx_u8_copy_sum_bt_kern_t);
+    void generate() override;
 
 public:
-    jit_avx_u8_copy_sum_bt_kern();
+    jit_avx_u8_copy_sum_bt_kern_t();
 };
 
-class jit_avx_kernel_b0_gemm_s8u8s32_kern : public jit_generator {
-    DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx_kernel_b0_gemm_s8u8s32_kern);
-    void generate() override ATTRIBUTE_OPTIMIZE;
+class jit_avx_kernel_b0_gemm_s8u8s32_kern_t : public jit_generator_t {
+    DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx_kernel_b0_gemm_s8u8s32_kern_t);
+    void generate() override;
 
 public:
-    jit_avx_kernel_b0_gemm_s8u8s32_kern();
+    jit_avx_kernel_b0_gemm_s8u8s32_kern_t();
 };
 
-class jit_avx_kernel_b0_b_gemm_s8u8s32_kern : public jit_generator {
-    DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx_kernel_b0_b_gemm_s8u8s32_kern);
-    void generate() override ATTRIBUTE_OPTIMIZE;
+class jit_avx_kernel_b0_b_gemm_s8u8s32_kern_t : public jit_generator_t {
+    DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx_kernel_b0_b_gemm_s8u8s32_kern_t);
+    void generate() override;
 
 public:
-    jit_avx_kernel_b0_b_gemm_s8u8s32_kern();
+    jit_avx_kernel_b0_b_gemm_s8u8s32_kern_t();
 };
 
-class jit_avx_kernel_b0_r_gemm_s8u8s32_kern : public jit_generator {
-    DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx_kernel_b0_r_gemm_s8u8s32_kern);
-    void generate() override ATTRIBUTE_OPTIMIZE;
+class jit_avx_kernel_b0_r_gemm_s8u8s32_kern_t : public jit_generator_t {
+    DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx_kernel_b0_r_gemm_s8u8s32_kern_t);
+    void generate() override;
 
 public:
-    jit_avx_kernel_b0_r_gemm_s8u8s32_kern();
+    jit_avx_kernel_b0_r_gemm_s8u8s32_kern_t();
 };
 
-class jit_avx_kernel_b0_c_gemm_s8u8s32_kern : public jit_generator {
-    DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx_kernel_b0_c_gemm_s8u8s32_kern);
-    void generate() override ATTRIBUTE_OPTIMIZE;
+class jit_avx_kernel_b0_c_gemm_s8u8s32_kern_t : public jit_generator_t {
+    DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx_kernel_b0_c_gemm_s8u8s32_kern_t);
+    void generate() override;
 
 public:
-    jit_avx_kernel_b0_c_gemm_s8u8s32_kern();
+    jit_avx_kernel_b0_c_gemm_s8u8s32_kern_t();
 };
 
-class jit_avx_kernel_gemm_s8u8s32_kern : public jit_generator {
-    DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx_kernel_gemm_s8u8s32_kern);
-    void generate() override ATTRIBUTE_OPTIMIZE;
+class jit_avx_kernel_gemm_s8u8s32_kern_t : public jit_generator_t {
+    DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx_kernel_gemm_s8u8s32_kern_t);
+    void generate() override;
 
 public:
-    jit_avx_kernel_gemm_s8u8s32_kern();
+    jit_avx_kernel_gemm_s8u8s32_kern_t();
 };
 
-class jit_avx_kernel_b_gemm_s8u8s32_kern : public jit_generator {
-    DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx_kernel_b_gemm_s8u8s32_kern);
-    void generate() override ATTRIBUTE_OPTIMIZE;
+class jit_avx_kernel_b_gemm_s8u8s32_kern_t : public jit_generator_t {
+    DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx_kernel_b_gemm_s8u8s32_kern_t);
+    void generate() override;
 
 public:
-    jit_avx_kernel_b_gemm_s8u8s32_kern();
+    jit_avx_kernel_b_gemm_s8u8s32_kern_t();
 };
 
-class jit_avx_kernel_r_gemm_s8u8s32_kern : public jit_generator {
-    DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx_kernel_r_gemm_s8u8s32_kern);
-    void generate() override ATTRIBUTE_OPTIMIZE;
+class jit_avx_kernel_r_gemm_s8u8s32_kern_t : public jit_generator_t {
+    DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx_kernel_r_gemm_s8u8s32_kern_t);
+    void generate() override;
 
 public:
-    jit_avx_kernel_r_gemm_s8u8s32_kern();
+    jit_avx_kernel_r_gemm_s8u8s32_kern_t();
 };
 
-class jit_avx_kernel_c_gemm_s8u8s32_kern : public jit_generator {
-    DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx_kernel_c_gemm_s8u8s32_kern);
-    void generate() override ATTRIBUTE_OPTIMIZE;
+class jit_avx_kernel_c_gemm_s8u8s32_kern_t : public jit_generator_t {
+    DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_avx_kernel_c_gemm_s8u8s32_kern_t);
+    void generate() override;
 
 public:
-    jit_avx_kernel_c_gemm_s8u8s32_kern();
+    jit_avx_kernel_c_gemm_s8u8s32_kern_t();
 };
 
-class jit_sse41_u8_copy_an_kern : public jit_generator {
-    DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_sse41_u8_copy_an_kern);
-    void generate() override ATTRIBUTE_OPTIMIZE;
+class jit_sse41_u8_copy_an_kern_t : public jit_generator_t {
+    DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_sse41_u8_copy_an_kern_t);
+    void generate() override;
 
 public:
-    jit_sse41_u8_copy_an_kern();
+    jit_sse41_u8_copy_an_kern_t();
 };
 
-class jit_sse41_u8_copy_at_kern : public jit_generator {
-    DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_sse41_u8_copy_at_kern);
-    void generate() override ATTRIBUTE_OPTIMIZE;
+class jit_sse41_u8_copy_at_kern_t : public jit_generator_t {
+    DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_sse41_u8_copy_at_kern_t);
+    void generate() override;
 
 public:
-    jit_sse41_u8_copy_at_kern();
+    jit_sse41_u8_copy_at_kern_t();
 };
 
-class jit_sse41_u8_copy_bn_kern : public jit_generator {
-    DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_sse41_u8_copy_bn_kern);
-    void generate() override ATTRIBUTE_OPTIMIZE;
+class jit_sse41_u8_copy_bn_kern_t : public jit_generator_t {
+    DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_sse41_u8_copy_bn_kern_t);
+    void generate() override;
 
 public:
-    jit_sse41_u8_copy_bn_kern();
+    jit_sse41_u8_copy_bn_kern_t();
 };
 
-class jit_sse41_u8_copy_bt_kern : public jit_generator {
-    DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_sse41_u8_copy_bt_kern);
-    void generate() override ATTRIBUTE_OPTIMIZE;
+class jit_sse41_u8_copy_bt_kern_t : public jit_generator_t {
+    DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_sse41_u8_copy_bt_kern_t);
+    void generate() override;
 
 public:
-    jit_sse41_u8_copy_bt_kern();
+    jit_sse41_u8_copy_bt_kern_t();
 };
 
-class jit_sse41_u8_copy_sum_an_kern : public jit_generator {
-    DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_sse41_u8_copy_sum_an_kern);
-    void generate() override ATTRIBUTE_OPTIMIZE;
+class jit_sse41_u8_copy_sum_an_kern_t : public jit_generator_t {
+    DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_sse41_u8_copy_sum_an_kern_t);
+    void generate() override;
 
 public:
-    jit_sse41_u8_copy_sum_an_kern();
+    jit_sse41_u8_copy_sum_an_kern_t();
 };
 
-class jit_sse41_u8_copy_sum_at_kern : public jit_generator {
-    DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_sse41_u8_copy_sum_at_kern);
-    void generate() override ATTRIBUTE_OPTIMIZE;
+class jit_sse41_u8_copy_sum_at_kern_t : public jit_generator_t {
+    DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_sse41_u8_copy_sum_at_kern_t);
+    void generate() override;
 
 public:
-    jit_sse41_u8_copy_sum_at_kern();
+    jit_sse41_u8_copy_sum_at_kern_t();
 };
 
-class jit_sse41_u8_copy_sum_bn_kern : public jit_generator {
-    DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_sse41_u8_copy_sum_bn_kern);
-    void generate() override ATTRIBUTE_OPTIMIZE;
+class jit_sse41_u8_copy_sum_bn_kern_t : public jit_generator_t {
+    DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_sse41_u8_copy_sum_bn_kern_t);
+    void generate() override;
 
 public:
-    jit_sse41_u8_copy_sum_bn_kern();
+    jit_sse41_u8_copy_sum_bn_kern_t();
 };
 
-class jit_sse41_u8_copy_sum_bt_kern : public jit_generator {
-    DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_sse41_u8_copy_sum_bt_kern);
-    void generate() override ATTRIBUTE_OPTIMIZE;
+class jit_sse41_u8_copy_sum_bt_kern_t : public jit_generator_t {
+    DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_sse41_u8_copy_sum_bt_kern_t);
+    void generate() override;
 
 public:
-    jit_sse41_u8_copy_sum_bt_kern();
+    jit_sse41_u8_copy_sum_bt_kern_t();
 };
 
-class jit_sse41_kernel_b0_gemm_s8u8s32_kern : public jit_generator {
-    DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_sse41_kernel_b0_gemm_s8u8s32_kern);
-    void generate() override ATTRIBUTE_OPTIMIZE;
+class jit_sse41_kernel_b0_gemm_s8u8s32_kern_t : public jit_generator_t {
+    DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_sse41_kernel_b0_gemm_s8u8s32_kern_t);
+    void generate() override;
 
 public:
-    jit_sse41_kernel_b0_gemm_s8u8s32_kern();
+    jit_sse41_kernel_b0_gemm_s8u8s32_kern_t();
 };
 
-class jit_sse41_kernel_b0_b_gemm_s8u8s32_kern : public jit_generator {
-    DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_sse41_kernel_b0_b_gemm_s8u8s32_kern);
-    void generate() override ATTRIBUTE_OPTIMIZE;
+class jit_sse41_kernel_b0_b_gemm_s8u8s32_kern_t : public jit_generator_t {
+    DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_sse41_kernel_b0_b_gemm_s8u8s32_kern_t);
+    void generate() override;
 
 public:
-    jit_sse41_kernel_b0_b_gemm_s8u8s32_kern();
+    jit_sse41_kernel_b0_b_gemm_s8u8s32_kern_t();
 };
 
-class jit_sse41_kernel_b0_r_gemm_s8u8s32_kern : public jit_generator {
-    DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_sse41_kernel_b0_r_gemm_s8u8s32_kern);
-    void generate() override ATTRIBUTE_OPTIMIZE;
+class jit_sse41_kernel_b0_r_gemm_s8u8s32_kern_t : public jit_generator_t {
+    DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_sse41_kernel_b0_r_gemm_s8u8s32_kern_t);
+    void generate() override;
 
 public:
-    jit_sse41_kernel_b0_r_gemm_s8u8s32_kern();
+    jit_sse41_kernel_b0_r_gemm_s8u8s32_kern_t();
 };
 
-class jit_sse41_kernel_b0_c_gemm_s8u8s32_kern : public jit_generator {
-    DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_sse41_kernel_b0_c_gemm_s8u8s32_kern);
-    void generate() override ATTRIBUTE_OPTIMIZE;
+class jit_sse41_kernel_b0_c_gemm_s8u8s32_kern_t : public jit_generator_t {
+    DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_sse41_kernel_b0_c_gemm_s8u8s32_kern_t);
+    void generate() override;
 
 public:
-    jit_sse41_kernel_b0_c_gemm_s8u8s32_kern();
+    jit_sse41_kernel_b0_c_gemm_s8u8s32_kern_t();
 };
 
-class jit_sse41_kernel_gemm_s8u8s32_kern : public jit_generator {
-    DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_sse41_kernel_gemm_s8u8s32_kern);
-    void generate() override ATTRIBUTE_OPTIMIZE;
+class jit_sse41_kernel_gemm_s8u8s32_kern_t : public jit_generator_t {
+    DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_sse41_kernel_gemm_s8u8s32_kern_t);
+    void generate() override;
 
 public:
-    jit_sse41_kernel_gemm_s8u8s32_kern();
+    jit_sse41_kernel_gemm_s8u8s32_kern_t();
 };
 
-class jit_sse41_kernel_b_gemm_s8u8s32_kern : public jit_generator {
-    DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_sse41_kernel_b_gemm_s8u8s32_kern);
-    void generate() override ATTRIBUTE_OPTIMIZE;
+class jit_sse41_kernel_b_gemm_s8u8s32_kern_t : public jit_generator_t {
+    DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_sse41_kernel_b_gemm_s8u8s32_kern_t);
+    void generate() override;
 
 public:
-    jit_sse41_kernel_b_gemm_s8u8s32_kern();
+    jit_sse41_kernel_b_gemm_s8u8s32_kern_t();
 };
 
-class jit_sse41_kernel_r_gemm_s8u8s32_kern : public jit_generator {
-    DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_sse41_kernel_r_gemm_s8u8s32_kern);
-    void generate() override ATTRIBUTE_OPTIMIZE;
+class jit_sse41_kernel_r_gemm_s8u8s32_kern_t : public jit_generator_t {
+    DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_sse41_kernel_r_gemm_s8u8s32_kern_t);
+    void generate() override;
 
 public:
-    jit_sse41_kernel_r_gemm_s8u8s32_kern();
+    jit_sse41_kernel_r_gemm_s8u8s32_kern_t();
 };
 
-class jit_sse41_kernel_c_gemm_s8u8s32_kern : public jit_generator {
-    DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_sse41_kernel_c_gemm_s8u8s32_kern);
-    void generate() override ATTRIBUTE_OPTIMIZE;
+class jit_sse41_kernel_c_gemm_s8u8s32_kern_t : public jit_generator_t {
+    DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_sse41_kernel_c_gemm_s8u8s32_kern_t);
+    void generate() override;
 
 public:
-    jit_sse41_kernel_c_gemm_s8u8s32_kern();
+    jit_sse41_kernel_c_gemm_s8u8s32_kern_t();
 };
 
 } // namespace x64
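Every declaration above follows the same jit_generator_t scheme: a private generate() override emits the machine code for one kernel variant, and the public constructor selects that variant. A minimal sketch of how such a kernel is built, assuming the common_u8.hpp header name and the create_kernel() entry point inherited from jit_generator_t; the wrapper function below is hypothetical, and the real gemm driver dispatches the generated code through its own function-pointer tables:

    // Hypothetical wrapper: construct one of the renamed kernels and let
    // create_kernel() run generate() once and finalize the code buffer.
    #include "cpu/x64/gemm/s8x8s32/common_u8.hpp" // assumed header for these classes

    dnnl::impl::status_t build_copy_kernel_sketch() {
        using namespace dnnl::impl::cpu::x64;
        jit_avx2_u8_copy_bt_kern_t copy_kern;
        return copy_kern.create_kernel(); // status::success once code is ready
    }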
diff --git a/src/cpu/x64/gemm/s8x8s32/jit_avx2_gemm_s8u8s32_kern.cpp b/src/cpu/x64/gemm/s8x8s32/jit_avx2_gemm_s8u8s32_kern.cpp
index d15e3cc71b6..a214221860a 100644
--- a/src/cpu/x64/gemm/s8x8s32/jit_avx2_gemm_s8u8s32_kern.cpp
+++ b/src/cpu/x64/gemm/s8x8s32/jit_avx2_gemm_s8u8s32_kern.cpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2018-2024 Intel Corporation
+* Copyright 2018-2025 Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -38,7 +38,7 @@ static inline Xmm make_xmm(const Xmm &v) {
 }
 
 // Load from or store to C.
-void jit_avx2_gemm_s8u8s32_kern::c_load(
+void jit_avx2_gemm_s8u8s32_kern_t::c_load(
         const Xbyak::Xmm &dst, const Xbyak::Address &src, int nelems) {
     switch (nelems) {
         case 1: vmovss(make_xmm(dst), src); break;
@@ -51,7 +51,7 @@
     }
 }
 
-void jit_avx2_gemm_s8u8s32_kern::c_store(
+void jit_avx2_gemm_s8u8s32_kern_t::c_store(
         const Xbyak::Address &dst, const Xbyak::Xmm &src, int nelems) {
     switch (nelems) {
         case 1: vmovss(dst, make_xmm(src)); break;
@@ -67,7 +67,7 @@
 // Perform length-4 dot product accumulations of unsigned and signed bytes
 // in parallel.
 // Use VEX vpdpbusd if avx2-vnni available, otherwise emulate.
-void jit_avx2_gemm_s8u8s32_kern::dot_product(
+void jit_avx2_gemm_s8u8s32_kern_t::dot_product(
         const Xmm &dst, const Xmm &src1, const Xmm &src2) {
     if (vnni_) {
         vpdpbusd(dst, src1, src2, VexEncoding);
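The dot_product hunk above is the one behavioral branch in this file: with avx2-vnni, a single vpdpbusd performs a length-4 u8 x s8 multiply-accumulate into each 32-bit lane; without it, the same result is built from wider intermediate operations. A scalar model of the per-lane semantics, purely as an illustration (the function name is hypothetical):

    #include <cstdint>

    // One 32-bit lane of vpdpbusd: dst += sum over i = 0..3 of
    // (unsigned byte i of src1) * (signed byte i of src2), with products
    // sign-extended to 32 bits and no intermediate saturation.
    static int32_t vpdpbusd_lane_model(
            int32_t dst, uint32_t src1, uint32_t src2) {
        for (int i = 0; i < 4; ++i) {
            uint8_t u = uint8_t(src1 >> (8 * i)); // unsigned byte of src1
            int8_t s = int8_t(src2 >> (8 * i)); // signed byte of src2
            dst += int32_t(u) * int32_t(s);
        }
        return dst;
    }

A common emulation on plain AVX2 is vpmaddubsw (u8 x s8 to saturated 16-bit pair sums) followed by vpmaddwd against a vector of 16-bit ones and a final vpaddd; unlike vpdpbusd, that sequence can saturate in the 16-bit intermediate step.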
@@ -79,7 +79,7 @@
 }
 
 // Inner kernel.
-void jit_avx2_gemm_s8u8s32_kern::kernel_loop(
+void jit_avx2_gemm_s8u8s32_kern_t::kernel_loop(
         int unroll_m, int unroll_n, bool cfetch) {
     int um_vecs = (unroll_m + 7) >> 3;
     Label label_kernel_loop;
@@ -137,7 +137,7 @@
 }
 
 // k remainder loop for kernel.
-void jit_avx2_gemm_s8u8s32_kern::remainder_kernel(
+void jit_avx2_gemm_s8u8s32_kern_t::remainder_kernel(
         int unroll_m, int unroll_n, int unroll_k, int bwidth) {
     Ymm b = b_regs_[0];
 
@@ -165,7 +165,7 @@
 }
 
 // Inner loop.
-void jit_avx2_gemm_s8u8s32_kern::innerloop(int unroll_m, int unroll_n) {
+void jit_avx2_gemm_s8u8s32_kern_t::innerloop(int unroll_m, int unroll_n) {
     int um_vecs = (unroll_m + 7) >> 3;
     int stage1 = unroll_n, stage2 = mayiuse(avx2_vnni) ? 32 : 16;
 
@@ -308,7 +308,7 @@
 }
 
 // Outer loop.
-void jit_avx2_gemm_s8u8s32_kern::outerloop(
+void jit_avx2_gemm_s8u8s32_kern_t::outerloop(
         int unroll_x, int unroll_y, Label *&cur_outerloop_label) {
     Label label_m_loop, label_n_loop;
     std::vector